我用QT寫了一個C語言詞法分析器
本人的分享均來自于實際設計過程中的感悟
不能保證分享成果的正確性,如有錯誤,請各路大神指出,我會虛心學習,感謝!!!
![]()
有時候我很好奇,編譯器是如何知道我們輸入的代碼是什麼意思的,它又是如何把我們的代碼編譯成二進制可執行文件的呢?今天我們來實現編譯器的第一步:一個非常簡單的C語言詞法分析器。
用于測試的代碼文件,hello.c代碼如下:
uint a=2147483649,b=321; // two declarations in one statement; 2147483649 is larger than a 32-bit signed int can hold
double c=111.1; // numeric literal containing a decimal point
string str="ABC123\n"; // string literal ending with the \n escape sequence
int main(int aa,int bb) // sample entry point; the parameters aa and bb are not used in the body
{
int x=0,y=3; // two local declarations in one statement
a++; // post-increment operator
a--; // post-decrement operator
if(a!=b) // two-character inequality operator
{
a=1;
}
else
{
a=2;
}
printf("ABC %d '\" \\ 123\r\n",a); // literal mixing a bare ', an escaped ", an escaped backslash, \r and \n
}
int add(int a1,int a2) // returns the sum of its two arguments
{
return a1+a2;
}
詞法分析器的代碼如下lexer.h:
#ifndef LEXER_H
#define LEXER_H
#include <QString>
#include <QObject>
#include <QList>
#include <QMap>
#include <QDebug>
#include <QMetaEnum>
// Category attached to every token.
// The numeric values are spelled out because they are used as list indices
// when a category is turned into its display name.
enum TokenType
{
    ID = 0,     // identifier-shaped words: keywords, function names, variable names
    NUM = 1,    // numeric literals
    STRING = 2, // string literals
    OP = 3      // operators and delimiters
};
//單詞屬性
class Token
{
public:
QStringList TokenType_str={"ID","NUM","STRING","OP"};
QString word;//單詞內容
TokenType type;//單詞類別
uint line;//單詞所在行
Token()
{
}
Token(QString word,TokenType type,uint line)
{
this->word=word;
this->type=type;
this->line=line;
}
void prt()
{
QString s="%1: %2 > %3 ";
s=s.arg(line,5).arg(TokenType_str[type],10).arg(word,10);
qDebug()<<s;
}
};
// Lexical analyzer: turns source text into a list of Token objects.
class Lexer
{
public:
    // Creates a lexer with no source text.
    Lexer();
    // Creates a lexer that holds `code` as its source text.
    Lexer(QString code);
    // Tokenizes `code`; when `code` is empty, the source text already held
    // by the lexer is tokenized instead. Returns the tokens in source order.
    QList<Token> run(QString code="");

private:
    QString codestr; // source text to tokenize
    uint line;       // current line counter used while scanning
};
#endif // LEXER_H
lexer.cpp
#include "lexer.h"
// Default constructor: empty source text, line counter at 1.
Lexer::Lexer()
    : codestr(""), line(1)
{
}
// Stores `code` as the source text and sets the line counter to 1.
Lexer::Lexer(QString code)
    : codestr(code), line(1)
{
}
QList<Token> Lexer::run(QString code)
{
QList<Token> tokens;
if(code.length()>0)
{
codestr=code;
}
if(codestr.length()>0)
{
//開始解析
QByteArray local8Bit = codestr.toLocal8Bit();
char* p=local8Bit.data();//臨時指針
char tk=*p;
//遍歷字符串
while((tk=*p++)!='\0')
{
//tk代表當前字符 ntk*p代表后一個字符
if(tk=='\n'){line++;}//行數統計
else if(tk=='#'){while (*p != 0 && *p != '\n') ++p;}//忽略#關鍵字,不支持
else if ((tk >= 'a' && tk <= 'z') || (tk >= 'A' && tk <= 'Z') || tk == '_') {//解析ID
QString str(tk);
while ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || *p == '_')
{
str.append(*p++);
}
Token token(str,TokenType::ID,line);
tokens.append(token);
}
else if (tk >= '0' && tk <= '9') {//解析數字
QString str(tk);
while ((*p >= '0' && *p <= '9') || (*p >= 'a' && *p <= 'f') || (*p >= 'A' && *p <= 'F')
||*p == 'x' || *p == 'X'
||*p == 'b' || *p == 'B'
||*p == '.' || *p == 'e'
)
{
str.append(*p++);
}
Token token(str,TokenType::NUM,line);
tokens.append(token);
}
else if (tk == '/') {
if (*p == '/') {//忽略注釋
++p;
while (*p != 0 && *p != '\n') ++p;
}
else if(*p=='*')//忽略多行注釋
{
++p;
while (*p!='\0'){
char c1=*p;
char c2=*(p+1);
if(c1=='*' && c2=='/')
{
++p;++p;
break;
}
++p;
}
}
else {
Token token("/",TokenType::OP,line);
tokens.append(token);
}
}
else if (tk == '\'' || tk == '"') {
QString str;
while (*p != 0 && *p != tk) {
if(*p=='\\')
{
char nc=*++p;//xia'yi'ge下一個字符
if(nc=='n' || nc=='r' || nc=='t' || nc =='\'' || nc=='\"' || nc=='\\')//轉義字符
{
if(nc=='n')str.append('\n');
if(nc=='r')str.append('\r');
if(nc=='t')str.append('\t');
if(nc=='\'')str.append('\'');
if(nc=='\"')str.append('\"');
if(nc=='\\')str.append('\\');
++p;
}
}
else
{
str.append(*p++);
}
}
++p;
if (tk == '"'){
Token token(str,TokenType::STRING,line);
tokens.append(token);
}
else
{
Token token(str,TokenType::NUM,line);
tokens.append(token);
}
}
else if (tk == '=' || tk == '+' || tk == '-' || tk == '|' || tk == '&')
{
char tk_next = *p;//檢查下個字符串是否相同
if (tk_next == tk)
{
p++;
QString str;
str.append(tk);
str.append(tk);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
else
{
QString str;
str.append(tk);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
}
else if (tk == '!' || tk == '>' || tk == '<')
{
char tk_next = *p;//檢查下個字符串是否相同
if (tk_next == '=')
{
p++;
QString str;
str.append(tk);
str.append(tk_next);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
else
{
QString str;
str.append(tk);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
}
else if (tk == '~' || tk == ';' || tk == '{' || tk == '}' || tk == '(' || tk == ')' || tk == ']' || tk == ',' || tk == ':')
{
QString str;
str.append(tk);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
}
}
//打印單詞信息
foreach (Token t, tokens) {
t.prt();
}
qDebug()<<"=================================================";
return tokens;
}
在main.cpp文件中調用詞法分析器分析hello.c中的代碼:
#include <QCoreApplication>
#include <QFile>
#include <lexer.h>
#include <parser.h>
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
QFile file("./hello.c");
file.open(QFile::ReadOnly);
QByteArray localReadAll = file.readAll();
file.close();
QString code=QString::fromUtf8(localReadAll);
//開始詞法分析
Lexer lx(code);
QList<Token> tokens = lx.run();
return a.exec();
}
執行后的結果:
" 3: ID > uint "
" 3: ID > a "
" 3: OP > = "
" 3: NUM > 2147483649 "
" 3: OP > , "
" 3: ID > b "
" 3: OP > = "
" 3: NUM > 321 "
" 3: OP > ; "
" 4: ID > double "
" 4: ID > c "
" 4: OP > = "
" 4: NUM > 111.1 "
" 4: OP > ; "
" 5: ID > string "
" 5: ID > str "
" 5: OP > = "
" 5: STRING > ABC123\n "
" 5: OP > ; "
" 7: ID > int "
" 7: ID > main "
" 7: OP > ( "
" 7: ID > int "
" 7: ID > aa "
" 7: OP > , "
" 7: ID > int "
" 7: ID > bb "
" 7: OP > ) "
" 8: OP > { "
" 9: ID > int "
" 9: ID > x "
" 9: OP > = "
" 9: NUM > 0 "
" 9: OP > , "
" 9: ID > y "
" 9: OP > = "
" 9: NUM > 3 "
" 9: OP > ; "
" 10: ID > a "
" 10: OP > ++ "
" 10: OP > ; "
" 11: ID > a "
" 11: OP > -- "
" 11: OP > ; "
" 12: ID > if "
" 12: OP > ( "
" 12: ID > a "
" 12: OP > != "
" 12: ID > b "
" 12: OP > ) "
" 13: OP > { "
" 14: ID > a "
" 14: OP > = "
" 14: NUM > 1 "
" 14: OP > ; "
" 15: OP > } "
" 16: ID > else "
" 17: OP > { "
" 18: ID > a "
" 18: OP > = "
" 18: NUM > 2 "
" 18: OP > ; "
" 19: OP > } "
" 20: ID > printf "
" 20: OP > ( "
" 20: STRING > ABC %d '\" \\ 123\r\n "
" 20: OP > , "
" 20: ID > a "
" 20: OP > ) "
" 20: OP > ; "
" 21: OP > } "
" 25: ID > int "
" 25: ID > add "
" 25: OP > ( "
" 25: ID > int "
" 25: ID > a1 "
" 25: OP > , "
" 25: ID > int "
" 25: ID > a2 "
" 25: OP > ) "
" 26: OP > { "
" 27: ID > return "
" 27: ID > a1 "
" 27: OP > + "
" 27: ID > a2 "
" 27: OP > ; "
" 28: OP > } "
=================================================
可以看出,詞法分析器把代碼中的關鍵字、操作符、字符串、分割符等都分離出來了。當然,這是一個超級簡單的詞法分析器,功能並不完善,只是為了讓我們了解編譯過程中編譯器做了哪些工作。

浙公網安備 33010602011771號