1. 程式人生 > >哈工大編譯原理第一次實驗--詞法分析(Java版本)

哈工大編譯原理第一次實驗--詞法分析(Java版本)

1.在判斷空行的時候,java裡面用 line == "" 不好使(== 比較的是兩個物件的參考,而不是字串的內容),除錯發現進不去if,然後用line.equals("")就好使。

2.java格式化輸出,可以有:System.out.printf("%-10s\t<ERROR:識別符號重複!>\n",token);這種寫法!printf啊,但是它只能輸出到主控台,不能直接輸出到檔案中。不過我們可以這麼寫:

output.write(String.format("%-10s\t<%s,-->",token,token));  

String.format 救了我們哦~~

3.輸出到檔案中怎麼換行呢?output.write("空行~\r\n"); 呵呵,win下是\r\n哦,linux下\n。。。

===================================================================================

如何解讀這個看起來很糟糕的基本沒啥註釋的程式碼呢?

1.看清楚結構,結構如下:

(1)讀入一行line,把line轉成char[] 的strLine陣列,然後每次處理一個字元ch(看紅色程式碼,所有的處理都在for裡面)。

        (2)然後對每個ch進行分類:if else if else if 。。。建議每次看一個if{}就不會頭暈啦

2.看清楚演算法,這個是基於很精巧的“狀態轉移圖”的程式,我拿個數字處理的程式碼講解下:


那麼我們就建立個二維陣列來實現這個狀態的轉移:

   123456

 1 d.#e##

 2 ##d###

 3 ##de##

 4 ####-d

 5 #####d

 6 #####d

我們忽略0狀態,因為我們已經進入了。

狀態1到狀態1有向量連線,所以陣列d[1][1] = 'd'

狀態1到狀態2有向量連線,所以陣列d[1][2] = '.'

依次類推,沒有向量的就標為'#',然後關鍵程式碼如下:

// Excerpt of the numeric-literal scanner. It walks the digitDFA table one
// input character at a time; s is the current state and starts at 1 because
// state 0 has already been left by the time this code runs.
int s = 1;
// Becomes true as soon as a '.' or an 'e' is seen.
Boolean isfloat = false;
// Continue only while ch is a character that may appear in a numeric literal.
while (ch != '\0'&& (isDigit(ch) || ch == '.' || ch == 'e' || ch == '-')) {
	if (ch == '.' || ch == 'e')
		isfloat = true;

	// Look in row s for a column k whose edge label accepts ch.
	int k;
	for (k = 1; k <= 6; k++) {
	    char tmpstr[] = digitDFA[s].toCharArray();
	    if (ch != '#'&& 1 == in_digitDFA(ch, tmpstr[k])) {
		// Edge s -> k accepts ch: consume it and move to state k.
		token += ch;
		s = k;
		break;
	    }
        }
        // k ran past 6: state s has no edge for ch, so the literal ends here.
        if (k > 6)
	    break;
	// NOTE(review): no bounds check in this excerpt; ++i can run past the end
	// of strLine when the number is the last thing on the line.
	ch = strLine[++i];
}
當迴圈退出的時候(如果目前狀態沒有任何可用的轉移,k 會大於 6 而跳出;否則是因為 ch 不再是數字常量可能出現的字元),s 就是最後停留的狀態,當狀態為 1 , 3 ,6 的時候是正常退出

為 2 ,4 ,5的時候是有錯誤地退出。

=====================================================================

我的code.txt:

int a="a;
main()
{
int b =99A1;
int a= 999;
int c='a';
int abc = "hahah";
/*你妹啊*/
//你好啊

	print("Hello World!\n");//你又好了
	return 0;/*你妹啊*/
}


我的輸出:

line : 1
int       	<int,-->
a         	<識別符號,(a,入口:0)>
=         	<=,-->
"a        	ERROR:字串常量引號不封閉
;         	<;,-->




line : 2
main      	<識別符號,(main,入口:1)>
(         	<(,-->
)         	<),-->




line : 3
{         	<{,-->




line : 4
int       	<int,-->
b         	<識別符號,(b,入口:2)>
=         	<=,-->
99A1      	ERROR:請確保實常數輸入正確
;         	<;,-->




line : 5
int       	<int,-->
a         	<ERROR:識別符號重複!>
=         	<=,-->
999       	<實型常量,999>
;         	<;,-->




line : 6
int       	<int,-->
c         	<識別符號,(c,入口:3)>
=         	<=,-->
'a'       	<字元常量,a>
;         	<;,-->




line : 7
int       	<int,-->
abc       	<識別符號,(abc,入口:4)>
=         	<=,-->
"hahah"   	<字串常量,hahah>
;         	<;,-->




line : 8
/*你妹啊*/   	(註釋:/*你妹啊*/)




line : 9
//你好啊     	(註釋://你好啊)




line : 10
空行~




line : 11
print     	<識別符號,(print,入口:5)>
(         	<(,-->
"Hello World!\n"	<字串常量,Hello World!\n>
)         	<),-->
;         	<;,-->
//你又好了    	(註釋://你又好了)




line : 12
return    	<return,-->
0         	<實型常量,0>
;         	<;,-->
/*你妹啊*/   	(註釋:/*你妹啊*/)




line : 13
}         	<},-->
這是我那個很糟糕的原始碼:
package ouyang;

import java.io.*;
import java.util.*;

/**
 * Line-based lexical analyser for a small C-like language.
 *
 * <p>{@link #main(String[])} reads {@code code.txt} line by line, splits each
 * line into tokens and reports every token both on standard output and in
 * {@code out.txt} (CRLF-terminated). Numeric literals, quoted literals and
 * block comments are recognised with small table-driven state machines
 * ({@link #digitDFA}, {@link #stConDFA}, {@link #noteDFA}).
 *
 * <p>Limitations: every line is analysed on its own, so a block comment or a
 * string that continues on the next line is reported as unclosed. Any second
 * occurrence of an identifier is reported as a duplicate.
 */
public class AnalysisCodeToWord {

	/** Not used by the analyser; kept because it is a public field. */
	public static String code = "";

	/** Symbol table: identifier text mapped to the entry number assigned on first sight. */
	public static Map<String, Integer> symbol = new HashMap<String, Integer>();

	/** Entry number that the next new identifier will receive. */
	public static int symbol_pos = 0;

	/** Reserved words; an alphabetic token found here is reported as a keyword. */
	public static String keywords[] = { "auto", "double", "int", "struct",
			"break", "else", "long", "switch", "case", "enum", "register",
			"typedef", "char", "extern", "return", "union", "const", "float",
			"short", "unsigned", "continue", "for", "signed", "void",
			"default", "goto", "sizeof", "volatile", "do", "if", "while",
			"static" };

	/**
	 * Transition table for numeric literals. Row {@code s}, column {@code k}
	 * holds the label of the edge from state {@code s} to state {@code k}:
	 * {@code 'd'} stands for any digit, {@code '#'} means "no edge", any other
	 * character stands for itself. Row 0 and column 0 are padding so that
	 * states can be numbered 1..6. States 1, 3 and 6 are accepting.
	 */
	public static String digitDFA[] = { "#", "#d.#e##", "###d###", "###de##",
			"#####-d", "######d", "######d" };

	/** Single-character operators and delimiters. '/' is handled separately because it may start a comment. */
	public static char oper[] = { '+', '-', '*', '=', '<', '>', '&', '|', '~',
			'^', '!', '(', ')', '[', ']', '{', '}', '%', ';', ',', '#', '.' };

	/**
	 * Transition table shared by string and character constants. Edge labels
	 * are interpreted by {@link #in_stConDFA} / {@link #in_sinStConDFA}:
	 * backslash = a backslash, {@code 'd'} = any character other than a
	 * backslash or the closing quote, {@code 'a'} = any character,
	 * {@code '"'} = the closing quote. State 0 is "just after the opening
	 * quote", state 1 is "just after a backslash", state 2 is "inside the
	 * literal" and state 3 is "closed".
	 *
	 * <p>State 0 has a closing-quote edge so that the empty string
	 * {@code ""} is recognised as a closed literal.
	 */
	public static String stConDFA[] = { "#\\d\"", "##a#", "#\\d\"", "####" };

	/**
	 * Transition table for block comments, interpreted by {@link #in_noteDFA}.
	 * Only states 2 (inside the comment), 3 (just saw '*') and 4 (closed) are
	 * used by the scanner; {@code 'c'} stands for "any other character".
	 */
	public static String noteDFA[] = { "#", "##*##", "##c*#", "##c*/", "#####" };

	/**
	 * Analyses {@code code.txt} in the working directory and writes the token
	 * listing to {@code out.txt} and to standard output.
	 *
	 * @param args ignored
	 */
	public static void main(String args[]) {
		String infile = "code.txt";
		String outfile = "out.txt";
		BufferedReader dr = null;
		BufferedWriter output = null;
		try {
			dr = new BufferedReader(new InputStreamReader(new FileInputStream(infile)));
			output = new BufferedWriter(new FileWriter(outfile));

			String line;
			int cnt = 0;
			while ((line = dr.readLine()) != null) {
				cnt++;
				if (cnt == 1) {
					System.out.println("line : " + cnt);
					output.write(String.format("line : %d\r\n", cnt));
				} else {
					System.out.println("\n\nline : " + cnt);
					output.write(String.format("\r\n\r\nline : %d\r\n", cnt));
				}
				if (line.equals("")) {
					emit(output, "空行~");
				} else {
					analyzeLine(line, output);
				}
			}
		} catch (IOException e) {
			// Covers FileNotFoundException as well; both were only logged before.
			e.printStackTrace();
		} finally {
			// Close in the finally block so the streams are released (and the
			// writer flushed) even when analysis fails half-way.
			closeResource(dr);
			closeResource(output);
		}
	}

	/** Closes {@code c} if it is non-null, logging (not propagating) a failure. */
	private static void closeResource(Closeable c) {
		if (c == null) {
			return;
		}
		try {
			c.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/** Reports one result line on the console and, CRLF-terminated, in the output file. */
	private static void emit(Writer output, String text) throws IOException {
		System.out.println(text);
		output.write(text);
		output.write("\r\n");
	}

	/** Splits one non-empty source line into tokens and reports each of them. */
	private static void analyzeLine(String line, Writer output) throws IOException {
		char[] strLine = line.toCharArray();
		int i = 0;
		// Every scanner returns the index of the first character it did not
		// consume, which is always greater than the index it was given.
		while (i < strLine.length) {
			char ch = strLine[i];
			if (isAlpha(ch)) {
				i = scanWord(strLine, i, output);
			} else if (isDigit(ch)) {
				i = scanNumber(strLine, i, output);
			} else if (ch == '\'') {
				i = scanCharConstant(strLine, i, output);
			} else if (ch == '"') {
				i = scanStringConstant(strLine, i, output);
			} else if (isOp(ch)) {
				i = scanOperator(strLine, i, output);
			} else if (ch == '/') {
				i = scanSlash(strLine, i, output);
			} else {
				// Blanks and tabs separate tokens; anything else is illegal.
				if (ch != ' ' && ch != '\t') {
					emit(output, String.format("%-10c ERROR:存在不合法字元", ch));
				}
				i++;
			}
		}
	}

	/** Scans a keyword or identifier starting at {@code start}. */
	private static int scanWord(char[] strLine, int start, Writer output) throws IOException {
		int end = start;
		while (end < strLine.length && (isAlpha(strLine[end]) || isDigit(strLine[end]))) {
			end++;
		}
		String token = new String(strLine, start, end - start);

		if (isMatchKeyword(token)) {
			emit(output, String.format("%-10s\t<%s,-->", token, token));
		} else if (!symbol.containsKey(token)) {
			symbol.put(token, symbol_pos);
			emit(output, String.format("%-10s\t<識別符號,(%s,入口:%d)>", token, token, symbol_pos));
			symbol_pos++;
		} else {
			emit(output, String.format("%-10s\t<ERROR:識別符號重複!>", token));
		}
		return end;
	}

	/** Returns the state reached from state {@code s} on {@code ch}, or -1 if there is no edge. */
	private static int nextDigitState(int s, char ch) {
		String row = digitDFA[s];
		for (int k = 1; k <= 6; k++) {
			if (1 == in_digitDFA(ch, row.charAt(k))) {
				return k;
			}
		}
		return -1;
	}

	/** Tells whether {@code ch} may legally follow a complete numeric literal. */
	private static boolean endsNumber(char ch) {
		// '.' is in oper[] but "1.2.3" must still be rejected. Blanks and '/'
		// are not in oper[] yet are perfectly valid after a number ("4 / 2").
		return ch == ' ' || ch == '\t' || ch == '/' || (isOp(ch) && ch != '.');
	}

	/** Scans a numeric literal starting at {@code start}. */
	private static int scanNumber(char[] strLine, int start, Writer output) throws IOException {
		StringBuilder token = new StringBuilder();
		int s = 1;
		boolean isfloat = false;
		int i = start;
		while (i < strLine.length) {
			char ch = strLine[i];
			if (!isDigit(ch) && ch != '.' && ch != 'e' && ch != '-') {
				break;
			}
			int next = nextDigitState(s, ch);
			if (next < 0) {
				break;
			}
			if (ch == '.' || ch == 'e') {
				isfloat = true;
			}
			token.append(ch);
			s = next;
			i++;
		}

		// States 2, 4 and 5 are not accepting ("1.", "1e", "1e-").
		boolean haveMistake = (s == 2 || s == 4 || s == 5);
		// A literal that runs to the end of the line is fine; otherwise the
		// next character must be something that can follow a number.
		if (!haveMistake && i < strLine.length) {
			haveMistake = !endsNumber(strLine[i]);
		}

		if (haveMistake) {
			// Swallow the rest of the malformed token up to a separator.
			while (i < strLine.length && strLine[i] != '\0' && strLine[i] != ','
					&& strLine[i] != ';' && strLine[i] != ' ') {
				token.append(strLine[i]);
				i++;
			}
			emit(output, String.format("%-10s\tERROR:請確保實常數輸入正確!", token));
		} else if (isfloat) {
			emit(output, String.format("%-10s\t<實型常量,%s>", token, token));
		} else {
			emit(output, String.format("%-10s\t<整型常量,%s>", token, token));
		}
		return i;
	}

	/** Result of scanning a quoted literal. */
	private static final class Quoted {
		/** The literal exactly as written, including the quotes that were seen. */
		final StringBuilder raw = new StringBuilder();
		/** The literal's content without quotes. */
		final StringBuilder value = new StringBuilder();
		/** Whether the closing quote was found on this line. */
		boolean closed;
		/** Index of the first character that was not consumed. */
		int next;
	}

	/**
	 * Runs {@link #stConDFA} over the literal whose opening quote is at
	 * {@code start}. {@code isChar} selects single-quote rules.
	 */
	private static Quoted scanQuoted(char[] strLine, int start, boolean isChar) {
		Quoted q = new Quoted();
		q.raw.append(strLine[start]);
		int s = 0;
		int i = start;
		while (s != 3) {
			i++;
			// End of line (or an embedded NUL) before the closing quote.
			if (i >= strLine.length || strLine[i] == '\0') {
				break;
			}
			char ch = strLine[i];
			for (int k = 0; k < 4; k++) {
				char key = stConDFA[s].charAt(k);
				boolean hit = isChar ? in_sinStConDFA(ch, key) : in_stConDFA(ch, key);
				if (!hit) {
					continue;
				}
				q.raw.append(ch);
				if (k == 2 && s == 1) {
					// Character right after a backslash: keep the backslash
					// only for recognised escape letters.
					if (isEsSt(ch)) {
						q.value.append('\\');
					}
					q.value.append(ch);
				} else if (k != 3 && k != 1) {
					// Ordinary content; the backslash itself (k == 1) and the
					// closing quote (k == 3) are not part of the value.
					q.value.append(ch);
				}
				s = k;
				break;
			}
		}
		q.closed = (s == 3);
		// When closed, i is on the closing quote; otherwise i is the position
		// where scanning stopped and must be looked at again by the caller.
		q.next = q.closed ? i + 1 : i;
		return q;
	}

	/** Scans a character constant whose opening quote is at {@code start}. */
	private static int scanCharConstant(char[] strLine, int start, Writer output) throws IOException {
		Quoted q = scanQuoted(strLine, start, true);
		String token = q.value.toString();
		boolean single = token.length() == 1;
		boolean escape = token.length() == 2 && token.charAt(0) == '\\' && isEsSt(token.charAt(1));

		if (!q.closed) {
			emit(output, String.format("%s\tERROR:字元常量引號不封閉", q.raw));
		} else if (single || escape) {
			emit(output, String.format("%-10s\t<字元常量,%s>", q.raw, token));
		} else {
			// Empty ('') or multi-character ('ab') constants used to vanish
			// from the listing without any message.
			emit(output, String.format("%-10s\tERROR:字元常量不合法", q.raw));
		}
		return q.next;
	}

	/** Scans a string constant whose opening quote is at {@code start}. */
	private static int scanStringConstant(char[] strLine, int start, Writer output) throws IOException {
		Quoted q = scanQuoted(strLine, start, false);
		if (q.closed) {
			emit(output, String.format("%-10s\t<字串常量,%s>", q.raw, q.value));
		} else {
			emit(output, String.format("%-10s\tERROR:字串常量引號不封閉", q.raw));
		}
		return q.next;
	}

	/** Scans a one- or two-character operator/delimiter starting at {@code start}. */
	private static int scanOperator(char[] strLine, int start, Writer output) throws IOException {
		char first = strLine[start];
		StringBuilder token = new StringBuilder().append(first);
		int i = start + 1;
		// Only peek when there is a next character; an operator in the last
		// column must still be reported.
		if (isPlusEqu(first) && i < strLine.length) {
			char second = strLine[i];
			if (second == '=' || (isPlusSame(first) && second == first)) {
				token.append(second);
				i++;
			}
		}
		emit(output, String.format("%-10s\t<%s,-->", token, token));
		return i;
	}

	/** Scans a token starting with '/': division, "/=", a line comment or a block comment. */
	private static int scanSlash(char[] strLine, int start, Writer output) throws IOException {
		int i = start + 1;
		boolean atEnd = i >= strLine.length;

		// Division or "/=". A '/' in the last column is a plain division sign.
		if (atEnd || (strLine[i] != '*' && strLine[i] != '/')) {
			String token = "/";
			if (!atEnd && strLine[i] == '=') {
				token = "/=";
				i++;
			}
			emit(output, String.format("%-10s\t<%s,-->", token, token));
			return i;
		}

		// Line comment: everything from this "//" to the end of the line.
		// (Searching the line for the last "//" would cut the comment short
		// when it contains another "//".)
		if (strLine[i] == '/') {
			String token = new String(strLine, start, strLine.length - start);
			emit(output, String.format("%-10s\t(註釋:%s)", token, token));
			return strLine.length;
		}

		// Block comment.
		StringBuilder token = new StringBuilder("/*");
		int s = 2;
		while (s != 4) {
			i++;
			if (i >= strLine.length || strLine[i] == '\0') {
				break;
			}
			char ch = strLine[i];
			for (int k = 2; k <= 4; k++) {
				if (1 == in_noteDFA(ch, noteDFA[s].charAt(k), s)) {
					token.append(ch);
					s = k;
					break;
				}
			}
		}
		if (s == 4) {
			emit(output, String.format("%-10s\t(註釋:%s)", token, token));
			return i + 1; // step over the closing '/'
		}
		emit(output, String.format("%-10s\t", token) + "ERROR:註釋沒有封閉");
		return i;
	}

	/** Returns whether {@code ch} may start an identifier: an ASCII letter or underscore. */
	public static Boolean isAlpha(char ch) {
		return ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_');
	}

	/** Returns whether {@code ch} is an ASCII decimal digit. */
	public static Boolean isDigit(char ch) {
		return (ch >= '0' && ch <= '9');
	}

	/** Returns whether {@code str} is one of {@link #keywords}. */
	public static Boolean isMatchKeyword(String str) {
		// Iterate over the real array length instead of a hard-coded 32.
		for (int i = 0; i < keywords.length; i++) {
			if (keywords[i].equals(str)) {
				return true;
			}
		}
		return false;
	}

	/** Returns whether {@code ch} is one of {@link #oper}. */
	public static Boolean isOp(char ch) {
		// Iterate over the real array length instead of a hard-coded 22.
		for (int i = 0; i < oper.length; i++) {
			if (ch == oper[i]) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Tests one {@link #digitDFA} edge.
	 *
	 * @param ch input character
	 * @param dD edge label; {@code 'd'} matches any digit, anything else matches itself
	 * @return 1 if the edge accepts {@code ch}, otherwise 0
	 */
	public static int in_digitDFA(char ch, char dD) {
		if (dD == 'd') {
			return isDigit(ch) ? 1 : 0;
		}
		return (ch == dD) ? 1 : 0;
	}

	/**
	 * Tests one {@link #stConDFA} edge for a string constant.
	 *
	 * @param ch  input character
	 * @param key edge label: {@code 'a'} any character, backslash or
	 *            double quote for themselves, {@code 'd'} anything but those two
	 */
	public static Boolean in_stConDFA(char ch, char key) {
		if (key == 'a')
			return true;
		if (key == '\\')
			return ch == key;
		if (key == '"')
			return ch == key;
		if (key == 'd')
			return ch != '\\' && ch != '"';
		return false;
	}

	/**
	 * Tests one {@link #stConDFA} edge for a character constant. Same as
	 * {@link #in_stConDFA} except that the closing quote is a single quote.
	 */
	public static Boolean in_sinStConDFA(char ch, char key) {
		if (key == 'a')
			return true;
		if (key == '\\')
			return ch == key;
		if (key == '"')
			return ch == '\'';
		if (key == 'd')
			return ch != '\\' && ch != '\'';
		return false;
	}

	/**
	 * Returns whether {@code ch} may be followed by '=' to form a
	 * two-character operator. Includes '!' and '%' so that "!=" and "%="
	 * are reported as single tokens.
	 */
	public static Boolean isPlusEqu(char ch) {
		return ch == '+' || ch == '-' || ch == '*' || ch == '/' || ch == '='
				|| ch == '>' || ch == '<' || ch == '&' || ch == '|'
				|| ch == '^' || ch == '!' || ch == '%';
	}

	/** Returns whether {@code ch} may be doubled to form an operator ("++", "--", "&&", "||"). */
	public static Boolean isPlusSame(char ch) {
		return ch == '+' || ch == '-' || ch == '&' || ch == '|';
	}

	/** Returns whether {@code ch} is a letter the scanner treats as an escape after a backslash. */
	public static Boolean isEsSt(char ch) {
		return ch == 'a' || ch == 'b' || ch == 'f' || ch == 'n' || ch == 'r'
				|| ch == 't' || ch == 'v' || ch == '?' || ch == '0';
	}

	/**
	 * Tests one {@link #noteDFA} edge.
	 *
	 * @param ch input character
	 * @param nD edge label; {@code 'c'} means "any character that has no edge
	 *           of its own in state {@code s}"
	 * @param s  current state
	 * @return 1 if the edge accepts {@code ch}, otherwise 0
	 */
	public static int in_noteDFA(char ch, char nD, int s) {
		if (nD == 'c') {
			if (s == 2) {
				return (ch != '*') ? 1 : 0;
			}
			if (s == 3) {
				return (ch != '*' && ch != '/') ? 1 : 0;
			}
		}
		return (ch == nD) ? 1 : 0;
	}

}