用Java语言实现简单的词法分析器

whatthe89 7年前
   <p>编译原理中的词法分析算是很重要的一个部分,原理比较简单,不过网上大部分都是用C语言或者C++来编写,笔者近期在学习Java,故用Java语言实现了简单的词法分析器。</p>    <p>要分析的代码段如下:</p>    <p style="text-align:center"><img src="https://simg.open-open.com/show/f71b89e77d947c205dcfeb1abd72d8a7.png"></p>    <p style="text-align:center">输入文件.PNG</p>    <p>输出结果如下:</p>    <p style="text-align:center"><img src="https://simg.open-open.com/show/b3f3272d5a529a886934f2db59bcdbc9.png"></p>    <p style="text-align:center">输出结果(a).PNG</p>    <p style="text-align:center"><img src="https://simg.open-open.com/show/dbb167dbbfd02f78a1cba7ab7095df70.png"></p>    <p style="text-align:center">输出结果(b).PNG</p>    <p style="text-align:center"><img src="https://simg.open-open.com/show/be0b29259eed733ed3059cde359ec7c6.png"></p>    <p style="text-align:center">输出结果(c).PNG</p>    <p>括号里是一个二元式:(单词类别编码,单词位置编号)</p>    <p>代码如下:</p>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    import java.io.*;    /*   * 主程序   */  public class Main {      public static void main(String[] args) throws IOException {          Lexer lexer = new Lexer();          lexer.printToken();          lexer.printSymbolsTable();      }  }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    import java.io.*;  import java.util.*;    /*   * 词法分析并输出   */  public class Lexer {      /*记录行号*/      public static int line = 1;      /*存放最新读入的字符*/      char character = ' ';        /*保留字*/      Hashtable<String, KeyWord> keywords = new Hashtable<String, KeyWord>();      /*token序列*/      private ArrayList<Token> tokens = new ArrayList<Token>();      /*符号表*/      private ArrayList<Symbol> symtable = new ArrayList<Symbol>();        /*读取文件变量*/      BufferedReader reader = null;      /*保存当前是否读取到了文件的结尾*/      private Boolean isEnd = false;        /* 是否读取到文件的结尾 */      public Boolean getReaderState() {          return this.isEnd;      }        /*打印tokens序列*/      public void printToken() throws IOException {          FileWriter writer = new FileWriter("E:\\lex.txt");          System.out.println("词法分析结果如下:");          System.out.print("杜悦-2015220201031\r\n\n");          writer.write("杜悦-2015220201031\r\n\r\n");          while (getReaderState() == false) {              Token tok = scan();              String str = "line " + tok.line + "\t(" + tok.tag + "," + tok.pos + ")\t\t"                      + tok.name + ": " + tok.toString() + "\r\n";              writer.write(str);              System.out.print(str);          }          writer.flush();        }        /*打印符号表*/      public void printSymbolsTable() throws IOException {          FileWriter writer = new FileWriter("E:\\symtab1.txt");          System.out.print("\r\n\r\n符号表\r\n");          System.out.print("编号\t行号\t名称\r\n");          writer.write("符号表\r\n");          writer.write("编号 " + "\t行号 " + "\t名称 \r\n");          Iterator<Symbol> e = symtable.iterator();          while (e.hasNext()) {              Symbol symbol = e.next();              String desc = symbol.pos + "\t" + symbol.line + "\t" + symbol.toString();              System.out.print(desc + "\r\n");              writer.write(desc + "\r\n");          }            writer.flush();      }        /*打印错误*/      public void printError(Token tok) throws IOException{          FileWriter writer = new FileWriter("E:\\error.txt");          System.out.print("\r\n\r\n错误词法如下:\r\n");          writer.write("错误词法如下:\r\n");          String str = "line " + tok.line + "\t(" + tok.tag + "," + tok.pos + ")\t\t"                  + tok.name + ": " + tok.toString() + "\r\n";          writer.write(str);      }        /*添加保留字*/      void reserve(KeyWord w) {          keywords.put(w.lexme, w);      }        public Lexer() {          /*初始化读取文件变量*/          try {              reader = new BufferedReader(new FileReader("E:\\输入.txt"));          } catch (IOException e) {              System.out.print(e);          }            /*添加保留字*/          this.reserve(KeyWord.begin);          this.reserve(KeyWord.end);          this.reserve(KeyWord.integer);          this.reserve(KeyWord.function);          this.reserve(KeyWord.read);          this.reserve(KeyWord.write);          this.reserve(KeyWord.aIf);          this.reserve(KeyWord.aThen);          this.reserve(KeyWord.aElse);      }        /*按字符读*/      public void readch() throws IOException {          character = (char) reader.read();          if ((int) character == 0xffff) {              this.isEnd = true;          }      }        /*判断是否匹配*/      public Boolean readch(char ch) throws IOException {          readch();          if (this.character != ch) {              return false;          }            this.character = ' ';          return true;      }        /*数字的识别*/      public Boolean isDigit() throws IOException {          if (Character.isDigit(character)) {              int value = 0;              while (Character.isDigit(character)) {                  value = 10 * value + Character.digit(character, 10);                  readch();              }                Num n = new Num(value);              n.line = line;              tokens.add(n);              return true;          } else              return false;      }        /*保留字、标识符的识别*/      public Boolean isLetter() throws IOException {          if (Character.isLetter(character)) {              StringBuffer sb = new StringBuffer();                /*首先得到整个的一个分割*/              while (Character.isLetterOrDigit(character)) {                  sb.append(character);                  readch();              }                /*判断是保留字还是标识符*/              String s = sb.toString();              KeyWord w = keywords.get(s);                /*如果是保留字的话,w不应该是空的*/              if (w != null) {                  w.line = line;                  tokens.add(w);              } else {                  /*否则就是标识符,此处多出记录标识符编号的语句*/                  Symbol sy = new Symbol(s);                  Symbol mark = sy;           //用于标记已存在标识符                  Boolean isRepeat = false;                  sy.line = line;                  for (Symbol i : symtable) {                      if (sy.toString().equals(i.toString())) {                          mark = i;                          isRepeat = true;                      }                  }                  if (!isRepeat) {                      sy.pos = symtable.size() + 1;                      symtable.add(sy);                  } else if (isRepeat) {                      sy.pos = mark.pos;                  }                  tokens.add(sy);              }              return true;          } else              return false;      }        /*符号的识别*/      public Boolean isSign() throws IOException {          switch (character) {              case '#':                  readch();                  AllEnd.allEnd.line = line;                  tokens.add(AllEnd.allEnd);                  return true;              case '\r':                  if (readch('\n')) {                      readch();                      LineEnd.lineEnd.line = line;                      tokens.add(LineEnd.lineEnd);                      line++;                      return true;                  }              case '(':                  readch();                  Delimiter.lpar.line = line;                  tokens.add(Delimiter.lpar);                  return true;              case ')':                  readch();                  Delimiter.rpar.line = line;                  tokens.add(Delimiter.rpar);                  return true;              case ';':                  readch();                  Delimiter.sem.line = line;                  tokens.add(Delimiter.sem);                  return true;              case '+':                  readch();                  CalcWord.add.line = line;                  tokens.add(CalcWord.add);                  return true;              case '-':                  readch();                  CalcWord.sub.line = line;                  tokens.add(CalcWord.sub);                  return true;              case '*':                  readch();                  CalcWord.mul.line = line;                  tokens.add(CalcWord.mul);                  return true;              case '/':                  readch();                  CalcWord.div.line = line;                  tokens.add(CalcWord.div);                  return true;              case ':':                  if (readch('=')) {                      readch();                      CalcWord.assign.line = line;                      tokens.add(CalcWord.assign);                      return true;                  }                  break;              case '>':                  if (readch('=')) {                      readch();                      CalcWord.ge.line = line;                      tokens.add(CalcWord.ge);                      return true;                  }                  break;              case '<':                  if (readch('=')) {                      readch();                      CalcWord.le.line = line;                      tokens.add(CalcWord.le);                      return true;                  }                  break;              case '!':                  if (readch('=')) {                      readch();                      CalcWord.ne.line = line;                      tokens.add(CalcWord.ne);                      return true;                  }                  break;          }          return false;      }          /*下面开始分割关键字,标识符等信息*/      public Token scan() throws IOException {          Token tok;          while (character == ' ')              readch();          if (isDigit() || isSign() || isLetter()) {              tok = tokens.get(tokens.size() - 1);          } else {              tok = new Token(character);              printError(tok);          }          return tok;      }  }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /*   * Token父类   */  public class Token {      public final int tag;      public int line = 1;      public String name = "";      public int pos = 0;        public Token(int t) {          this.tag = t;      }        public String toString() {          return "" + (char) tag;      }    }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /*   * 单词类别赋值   */  public class Tag {      public final static int              BEGIN = 1,          //保留字              END = 2,            //保留字              INTEGER = 3,        //保留字              FUNCTION = 4,       //保留字              READ = 5,           //保留字              WRITE = 6,          //保留字              IF = 7,             //保留字              THEN = 8,           //保留字              ELSE = 9,           //保留字              SYMBOL = 11,        //标识符              CONSTANT = 12,      //常数              ADD = 13,           //运算符 "+"              SUB = 14,           //运算符 "-"              MUL = 15,           //运算符 "*"              DIV = 16,           //运算符 "/"              LE = 18,            //运算符 "<="              GE = 19,            //运算符 ">="              NE = 20,            //运算符 "!="              ASSIGN = 23,        //运算符 ":="              LPAR = 24,          //界符 "("              RPAR = 25,          //界符 ")"              SEM = 26,           //界符 ";"              LINE_END = 27,      //行尾符              ALL_END = 28;       //结尾符 "#"  }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /**   * 保留字   */  public class KeyWord extends Token {      public String lexme = "";        public KeyWord(String s, int t) {          super(t);          this.lexme = s;          this.name = "保留字";      }        public String toString() {          return this.lexme;      }        public static final KeyWord              begin = new KeyWord("begin", Tag.BEGIN),              end = new KeyWord("end", Tag.END),              integer = new KeyWord("integer", Tag.INTEGER),              function = new KeyWord("function", Tag.FUNCTION),              read = new KeyWord("read", Tag.READ),              write = new KeyWord("write", Tag.WRITE),              aIf = new KeyWord("if", Tag.IF),              aThen = new KeyWord("then", Tag.THEN),              aElse = new KeyWord("else", Tag.ELSE);  }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /*   * 标识符   */  public class Symbol extends Token {      public String lexme = "";        public Symbol(String s) {          super(Tag.SYMBOL);          this.lexme = s;          this.name = "标识符";      }        public String toString() {          return this.lexme;      }    }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /**   * 运算符   */  public class CalcWord extends Token {      public String lexme = "";        public CalcWord(String s, int t) {          super(t);          this.lexme = s;          this.name = "运算符";      }        public String toString() {          return this.lexme;      }        public static final CalcWord              add = new CalcWord("+", Tag.ADD),              sub = new CalcWord("-", Tag.SUB),              mul = new CalcWord("*", Tag.MUL),              div = new CalcWord("/", Tag.DIV),              le = new CalcWord("<=", Tag.LE),              ge = new CalcWord(">=", Tag.GE),              ne = new CalcWord("!=", Tag.NE),              assign = new CalcWord(":=", Tag.ASSIGN);  }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /**   * 界符   */  public class Delimiter extends Token {      public String lexme = "";        public Delimiter(String s, int t) {          super(t);          this.lexme = s;          this.name = "界符";      }        public String toString() {          return this.lexme;      }        public static final Delimiter              lpar = new Delimiter("(", Tag.LPAR),              rpar = new Delimiter(")", Tag.RPAR),              sem = new Delimiter(";", Tag.SEM);  }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /*   * 常数   */  public class Num extends Token {      public final int value;        public Num(int v) {          super(Tag.CONSTANT);          this.value = v;          this.name = "常数";      }        public String toString() {          return "" + value;      }  }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /**   * 行尾符   */  public class LineEnd extends Token {      public String lexme = "";        public LineEnd(String s) {          super(Tag.LINE_END);          this.lexme = s;          this.name = "行尾符";      }        public String toString() {          return this.lexme;      }        public static final LineEnd lineEnd = new LineEnd("\r\n");  }</code></pre>    <pre>  <code class="language-java">package Yue.LexicalAnalyzer;    /**   * 结尾符   */  public class AllEnd extends Token {      public String lexme = "";        public AllEnd(String s) {          super(Tag.ALL_END);          this.lexme = s;          this.name = "结尾符";      }        public String toString() {          return this.lexme;      }        public static final AllEnd allEnd = new AllEnd("#");  }</code></pre>    <p> </p>    <p>来自:http://www.jianshu.com/p/209f1fb6a827</p>    <p> </p>