1
2
3
4 package net.sourceforge.pmd.cpd;
5
import java.util.List;
import java.util.Locale;
7
8
9
10
11
12
13 public abstract class AbstractTokenizer implements Tokenizer {
14
15
16
17
18 protected List<String> stringToken;
19
20 protected List<String> ignorableCharacter;
21
22
23
24
25 protected List<String> ignorableStmt;
26
27 protected char oneLineCommentChar = '#';
28
29
30
31 private List<String> code;
32 private int lineNumber = 0;
33 private String currentLine;
34
35 protected boolean spanMultipleLinesString = true;
36
37 protected Character spanMultipleLinesLineContinuationCharacter = null;
38
39 private boolean downcaseString = true;
40
41 public void tokenize(SourceCode tokens, Tokens tokenEntries) {
42 code = tokens.getCode();
43
44 for (lineNumber = 0; lineNumber < code.size(); lineNumber++) {
45 currentLine = code.get(lineNumber);
46 int loc = 0;
47 while (loc < currentLine.length()) {
48 StringBuilder token = new StringBuilder();
49 loc = getTokenFromLine(token, loc);
50 if (token.length() > 0 && !isIgnorableString(token.toString())) {
51 if (downcaseString) {
52 token = new StringBuilder(token.toString().toLowerCase());
53 }
54
55
56
57
58 tokenEntries.add(new TokenEntry(token.toString(), tokens.getFileName(), lineNumber));
59
60 }
61 }
62 }
63 tokenEntries.add(TokenEntry.getEOF());
64 }
65
66 private int getTokenFromLine(StringBuilder token, int loc) {
67 for (int j = loc; j < currentLine.length(); j++) {
68 char tok = currentLine.charAt(j);
69 if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
70 if (isComment(tok)) {
71 if (token.length() > 0) {
72 return j;
73 } else {
74 return getCommentToken(token, loc);
75 }
76 } else if (isString(tok)) {
77 if (token.length() > 0) {
78 return j;
79
80 } else {
81
82 return parseString(token, j, tok);
83 }
84 } else {
85 token.append(tok);
86 }
87 } else {
88 if (token.length() > 0) {
89 return j;
90 }
91 }
92 loc = j;
93 }
94 return loc + 1;
95 }
96
97 private int parseString(StringBuilder token, int loc, char stringDelimiter) {
98 boolean escaped = false;
99 boolean done = false;
100 char tok = ' ';
101 while (loc < currentLine.length() && !done) {
102 tok = currentLine.charAt(loc);
103 if (escaped && tok == stringDelimiter) {
104 escaped = false;
105 } else if (tok == stringDelimiter && token.length() > 0) {
106
107 done = true;
108 } else if (tok == '\\') {
109 escaped = true;
110 } else {
111 escaped = false;
112 }
113
114 token.append(tok);
115 loc++;
116 }
117
118 if (!done &&
119 loc >= currentLine.length() &&
120
121
122
123 spanMultipleLinesString &&
124
125 lineNumber < code.size() - 1
126
127 ) {
128
129
130 if (spanMultipleLinesLineContinuationCharacter != null && token.length() > 0
131 && token.charAt(token.length() - 1) == spanMultipleLinesLineContinuationCharacter.charValue()) {
132 token.deleteCharAt(token.length() - 1);
133 }
134
135 currentLine = code.get(++lineNumber);
136
137 loc = parseString(token, 0, stringDelimiter);
138 }
139 return loc + 1;
140 }
141
142 private boolean ignoreCharacter(char tok) {
143 return ignorableCharacter.contains(String.valueOf(tok));
144 }
145
146 private boolean isString(char tok) {
147 return stringToken.contains(String.valueOf(tok));
148 }
149
150 private boolean isComment(char tok) {
151 return tok == oneLineCommentChar;
152 }
153
154 private int getCommentToken(StringBuilder token, int loc) {
155 while (loc < currentLine.length()) {
156 token.append(currentLine.charAt(loc++));
157 }
158 return loc;
159 }
160
161 private boolean isIgnorableString(String token) {
162 return ignorableStmt.contains(token);
163 }
164 }