View Javadoc
1   /**
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    */
4   package net.sourceforge.pmd.cpd;
5   
6   import java.io.BufferedReader;
7   import java.io.CharArrayReader;
8   import java.io.IOException;
9   
10  import org.apache.commons.io.IOUtils;
11  
12  /**
13   * This class does a best-guess try-anything tokenization.
14   *
15   * @author jheintz
16   */
17  public class CsTokenizer implements Tokenizer {
18  
19      @Override
20      public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
21          BufferedReader reader = new BufferedReader(new CharArrayReader(sourceCode.getCodeBuffer().toString()
22                  .toCharArray()));
23          try {
24              int ic = reader.read();
25              int line = 1;
26              char c;
27              StringBuilder b;
28              while (ic != -1) {
29                  c = (char) ic;
30                  switch (c) {
31                  // new line
32                  case '\n':
33                      line++;
34                      ic = reader.read();
35                      break;
36  
37                  // white space
38                  case ' ':
39                  case '\t':
40                  case '\r':
41                      ic = reader.read();
42                      break;
43  
44                  // ignore semicolons
45                  case ';':
46                      ic = reader.read();
47                      break;
48  
49                  // < << <= <<= > >> >= >>=
50                  case '<':
51                  case '>':
52                      ic = reader.read();
53                      if (ic == '=') {
54                          tokenEntries.add(new TokenEntry(c + "=", sourceCode.getFileName(), line));
55                          ic = reader.read();
56                      } else if (ic == c) {
57                          ic = reader.read();
58                          if (ic == '=') {
59                              tokenEntries.add(new TokenEntry(c + c + "=", sourceCode
60                                      .getFileName(), line));
61                              ic = reader.read();
62                          } else {
63                              tokenEntries.add(new TokenEntry(String.valueOf(c) + c, sourceCode
64                                      .getFileName(), line));
65                          }
66                      } else {
67                          tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
68                      }
69                      break;
70  
71                  // = == & &= && | |= || + += ++ - -= --
72                  case '=':
73                  case '&':
74                  case '|':
75                  case '+':
76                  case '-':
77                      ic = reader.read();
78                      if (ic == '=' || ic == c) {
79                          tokenEntries.add(new TokenEntry(c + String.valueOf((char) ic), sourceCode
80                                  .getFileName(), line));
81                          ic = reader.read();
82                      } else {
83                          tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
84                      }
85                      break;
86  
87                  // ! != * *= % %= ^ ^= ~ ~=
88                  case '!':
89                  case '*':
90                  case '%':
91                  case '^':
92                  case '~':
93                      ic = reader.read();
94                      if (ic == '=') {
95                          tokenEntries.add(new TokenEntry(c + "=", sourceCode.getFileName(), line));
96                          ic = reader.read();
97                      } else {
98                          tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
99                      }
100                     break;
101 
102                 // strings & chars
103                 case '"':
104                 case '\'':
105                     int beginLine = line;
106                     b = new StringBuilder();
107                     b.append(c);
108                     while ((ic = reader.read()) != c) {
109                         if (ic == -1) {
110                             break;
111                         }
112                         b.append((char) ic);
113                         if (ic == '\\') {
114                             int next = reader.read();
115                             if (next != -1) {
116                                 b.append((char) next);
117 
118                                 if (next == '\n') {
119                                     line++;
120                                 }
121                             }
122                         } else if (ic == '\n') {
123                             line++;
124                         }
125                     }
126                     if (ic != -1) {
127                         b.append((char) ic);
128                     }
129                     tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), beginLine));
130                     ic = reader.read();
131                     break;
132 
133                 // / /= /*...*/ //...
134                 case '/':
135                     switch (c = (char) (ic = reader.read())) {
136                     case '*':
137                         //int beginLine = line;
138                         int state = 1;
139                         b = new StringBuilder();
140                         b.append("/*");
141 
142                         while ((ic = reader.read()) != -1) {
143                             c = (char) ic;
144                             b.append(c);
145 
146                             if (c == '\n') {
147                                 line++;
148                             }
149 
150                             if (state == 1) {
151                                 if (c == '*') {
152                                     state = 2;
153                                 }
154                             } else {
155                                 if (c == '/') {
156                                     ic = reader.read();
157                                     break;
158                                 } else if (c != '*') {
159                                     state = 1;
160                                 }
161                             }
162                         }
163                         // ignore the /* comment
164                         // tokenEntries.add(new TokenEntry(b.toString(),
165                         // sourceCode.getFileName(), beginLine));
166                         break;
167 
168                     case '/':
169                         b = new StringBuilder();
170                         b.append("//");
171                         while ((ic = reader.read()) != '\n') {
172                             if (ic == -1) {
173                                 break;
174                             }
175                             b.append((char) ic);
176                         }
177                         // ignore the // comment
178                         // tokenEntries.add(new TokenEntry(b.toString(),
179                         // sourceCode.getFileName(), line));
180                         break;
181 
182                     case '=':
183                         tokenEntries.add(new TokenEntry("/=", sourceCode.getFileName(), line));
184                         ic = reader.read();
185                         break;
186 
187                     default:
188                         tokenEntries.add(new TokenEntry("/", sourceCode.getFileName(), line));
189                         break;
190                     }
191                     break;
192 
193                 default:
194                     // [a-zA-Z_][a-zA-Z_0-9]*
195                     if (Character.isJavaIdentifierStart(c)) {
196                         b = new StringBuilder();
197                         do {
198                             b.append(c);
199                             c = (char) (ic = reader.read());
200                         } while (Character.isJavaIdentifierPart(c));
201                         tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
202                     }
203                     // numbers
204                     else if (Character.isDigit(c) || c == '.') {
205                         b = new StringBuilder();
206                         do {
207                             b.append(c);
208                             if (c == 'e' || c == 'E') {
209                                 c = (char) (ic = reader.read());
210                                 if ("1234567890-".indexOf(c) == -1) {
211                                     break;
212                                 }
213                                 b.append(c);
214                             }
215                             c = (char) (ic = reader.read());
216                         } while ("1234567890.iIlLfFdDsSuUeExX".indexOf(c) != -1);
217 
218                         tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
219                     }
220                     // anything else
221                     else {
222                         tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
223                         ic = reader.read();
224                         break;
225                     }
226                 }
227             }
228         } catch (IOException e) {
229             e.printStackTrace();
230         } finally {
231             IOUtils.closeQuietly(reader);
232             tokenEntries.add(TokenEntry.getEOF());
233         }
234     }
235 }