手撸个python目标还是太遥远,先从简单的json解析器开始。捣鼓了半天,似乎是ok了,现总结一下。
1.Lexer 分词器
分词器的目标是将输入的String token化,定义了以下Token种类
-
public enum FFToken {
-
Comma,Colon,LBracket,RBracket,LBrace,RBrace,Str,Number,EOF;
-
}
分词器的实现核心便是一个状态机,当一个token被识别时,便将识别出的token返回给上层调用者:parser。
分词器需要一个peek功能,即偷瞄一下下一个token,这里面我采取的是回退机制,即token获取后,parser可以调用lexer的pushBackToken方法,将token退回去;这样下次getToken时,便还是返回这个token。
分词器的代码如下:
-
public class FFLexer {
-
private String str;
-
private int pos_start;
-
private int pos_end;
-
private int last_pos_start;
-
private int last_pos_end;
-
private State state;
-
private boolean bEscape;
-
private boolean dotted;
-
private String value;
-
enum State{
-
Start,
-
InNumber,
-
InStr
-
}
-
public FFLexer(String str){
-
this.str = str;
-
this.pos_start = 0;
-
this.pos_end = 0;
-
this.last_pos_start = 0;
-
this.last_pos_end = 0;
-
this.state = State.Start;
-
this.bEscape = false;
-
this.dotted = false;
-
}
-
public FFToken getToken() throws FFParseException{
-
this.last_pos_start = this.pos_start;
-
this.last_pos_end = this.pos_end;
-
FFToken token = null;
-
while(token == null){
-
if(this.pos_start == this.str.length()){
-
return FFToken.EOF;
-
}
-
this.pos_end ++;
-
boolean ended = false;
-
if(this.pos_end > this.str.length()){
-
ended = true;
-
}
-
char c = ”;
-
if(!ended){
-
c = endChar();
-
}
-
switch (this.state){
-
case Start:
-
if(c == ‘{‘){
-
token = FFToken.LBrace;
-
}else if(c == ‘}’){
-
token = FFToken.RBrace;
-
}else if(c == ‘[‘){
-
token = FFToken.LBracket;
-
}else if(c == ‘]’){
-
token = FFToken.RBracket;
-
}else if(c == ‘,’){
-
token = FFToken.Comma;
-
}else if(c == ‘:’){
-
token = FFToken.Colon;
-
}else if(c == ‘”‘){
-
state = State.InStr;
-
this.pos_start += 1;
-
}else if(Character.isDigit(c)){
-
state = State.InNumber;
-
}else if(c == ‘ ‘|| c==’\t’||c==’\n’||c==’\r’){
-
this.pos_start += 1;
-
}else{
-
throw new FFParseException();
-
}
-
break;
-
case InStr:
-
if(ended){
-
throw new FFParseException();
-
}
-
if(c == ‘”‘){
-
if(!bEscape){
-
value = this.str.substring(this.pos_start,this.pos_end – 1);
-
token = FFToken.Str;
-
this.state = State.Start;
-
}else{
-
this.bEscape = false;
-
}
-
}else{
-
if(c == ‘\\’){
-
this.bEscape = true;
-
}
-
}
-
break;
-
case InNumber:
-
if(ended){
-
throw new FFParseException();
-
}
-
if(Character.isDigit(c)){
-
}else if(endChar() == ‘.’){
-
if(this.dotted){
-
value = this.str.substring(this.pos_start,this.pos_end – 1);
-
token = FFToken.Number;
-
this.dotted = false;
-
this.state = State.Start;
-
this.pos_end –;
-
}else{
-
this.dotted = true;
-
}
-
}else{
-
value = this.str.substring(this.pos_start,this.pos_end – 1);
-
token = FFToken.Number;
-
this.dotted = false;
-
this.state = State.Start;
-
this.pos_end –;
-
}
-
break;
-
}
-
}
-
this.pos_start = this.pos_end;
-
return token;
-
}
-
public void pushBackToken(){
-
this.pos_start = this.last_pos_start;
-
this.pos_end = this.last_pos_end;
-
}
-
public String processedStr(){
-
return this.str.substring(0,this.pos_end );
-
}
-
private char endChar(){
-
return this.str.charAt(this.pos_end – 1);
-
}
-
public String getValue(){
-
return this.value;
-
}
-
}
可见dirty work是挺多的,调了我好久才总算看上去灵光。
2.解析器 parser
parser的实现核心便是函数递归调用,如要解析一个JsonObject,那么
我们首先需要获得一个{,
1)如果下一个token不是},
如果不是第一次匹配,需匹配一个逗号,
匹配一个key,再匹配一个:,再匹配一个value
回到1)处继续判断
否则,匹配一个}
JsonArray的匹配思路也是一样的。
当然,光解析不够,我们还得将转换后的结果保存、返回,这很类似编译器里面的代码生成。
-
public class FFParser {
-
private FFLexer ffLexer;
-
public FFParser(String str){
-
ffLexer = new FFLexer(str);
-
}
-
public FFJsonObject parseJsonObject() throws FFParseException{
-
return jsonObject();
-
}
-
public FFJsonArray parseJsonArray() throws FFParseException{
-
return jsonArray();
-
}
-
private FFJsonObject jsonObject() throws FFParseException{
-
FFJsonObject ffJsonObject = new FFJsonObject();
-
match(FFToken.LBrace);
-
FFToken token;
-
token = ffLexer.getToken();
-
ffLexer.pushBackToken();
-
if(token != FFToken.RBrace){
-
Object key = key();
-
match(FFToken.Colon);
-
Object value = value();
-
ffJsonObject.put(key,value);
-
}
-
do{
-
token = ffLexer.getToken();
-
if(token == FFToken.Comma){
-
token = ffLexer.getToken();
-
ffLexer.pushBackToken();
-
if(token != FFToken.RBrace ){
-
Object key = key();
-
match(FFToken.Colon);
-
Object value = value();
-
ffJsonObject.put(key,value);
-
token = ffLexer.getToken();
-
ffLexer.pushBackToken();
-
}
-
}else{
-
ffLexer.pushBackToken();
-
}
-
}while (token != FFToken.RBrace && token != FFToken.EOF);
-
match(FFToken.RBrace);
-
return ffJsonObject;
-
}
-
private FFJsonArray jsonArray() throws FFParseException{
-
FFJsonArray ffJsonArray = new FFJsonArray();
-
match(FFToken.LBracket);
-
FFToken token;
-
token = ffLexer.getToken();
-
ffLexer.pushBackToken();
-
if(token != FFToken.RBracket){
-
Object value = value();
-
ffJsonArray.add(value);
-
}
-
do{
-
token = ffLexer.getToken();
-
if(token == FFToken.Comma){
-
token = ffLexer.getToken();
-
ffLexer.pushBackToken();
-
if(token != FFToken.RBracket ) {
-
Object value = value();
-
ffJsonArray.add(value);
-
token = ffLexer.getToken();
-
ffLexer.pushBackToken();
-
}
-
}else{
-
ffLexer.pushBackToken();
-
}
-
}while (token != FFToken.RBracket && token != FFToken.EOF);
-
match(FFToken.RBracket);
-
return ffJsonArray;
-
}
-
private Object key() throws FFParseException{
-
FFToken token;
-
token = ffLexer.getToken();
-
if(token == FFToken.Str){
-
return ffLexer.getValue();
-
}else if(token == FFToken.Number){
-
return parseNumber(ffLexer.getValue());
-
}else{
-
System.err.println(token);
-
throw new FFParseException();
-
}
-
}
-
private Object value() throws FFParseException{
-
FFToken token;
-
token = ffLexer.getToken();
-
if(token == FFToken.Str){
-
return ffLexer.getValue();
-
}else if(token == FFToken.Number){
-
return parseNumber(ffLexer.getValue());
-
}else if(token == FFToken.LBrace){
-
ffLexer.pushBackToken();
-
return jsonObject();
-
}else if(token == FFToken.LBracket){
-
ffLexer.pushBackToken();;
-
return jsonArray();
-
}else{
-
throw new FFParseException();
-
}
-
}
-
private void match(FFToken expectedToken) throws FFParseException{
-
FFToken token = ffLexer.getToken();
-
if(token != expectedToken){
-
System.err.println(ffLexer.processedStr());
-
System.err.println(token);
-
System.err.println(expectedToken);
-
throw new FFParseException();
-
}
-
}
-
private Object parseNumber(String value){
-
if(value.contains(“.”)){
-
return Double.parseDouble(value);
-
}else{
-
return Long.parseLong(value);
-
}
-
}
-
}
最后的工作便是搞个门面类FFJson供外界调用了。代码就不贴在这儿了。
下面是测试结果:
-
public class App
-
{
-
public static void main( String[] args )
-
{
-
System.out.println(FFJson.parseObject(“{\”abc\”:123,\”dddd\”:[1,3,3],\”xxx\”:{\”a\”:1,\”b\”:\”a\”}}”));
-
System.out.println(FFJson.parseArray(“[1,2,3,4,5]”));
-
String str = “{\n” +
-
” \”code\”: 0,\n” +
-
” \”msg\”: \”成功\”,\n” +
-
” \”data\”: [\n” +
-
” {\n” +
-
” \”name\”: \”热榜\”,\n” +
-
” \”type\”: 1,\n” +
-
” \”foods\”: [\n” +
-
” {\n” +
-
” \”id\”: \”123456\”,\n” +
-
” \”name\”: \”皮蛋粥\”,\n” +
-
” \”price\”: 1.2,\n” +
-
” \”description\”: \”好吃的皮蛋粥\”,\n” +
-
” \”icon\”: \”http://xxx.com\”,\n” +
-
” }\n” +
-
” ]\n” +
-
” },\n” +
-
” {\n” +
-
” \”name\”: \”好吃的\”,\n” +
-
” \”type\”: 2,\n” +
-
” \”foods\”: [\n” +
-
” {\n” +
-
” \”id\”: \”123457\”,\n” +
-
” \”name\”: \”慕斯蛋糕\”,\n” +
-
” \”price\”: 10.9,\n” +
-
” \”description\”: \”美味爽口\”,\n” +
-
” \”icon\”: \”http://xxx.com\”,\n” +
-
” }\n” +
-
” ]\n” +
-
” }\n” +
-
” ]\n” +
-
“}”;
-
FFJsonObject ffJsonObject = FFJson.parseObject(str);
-
System.out.println(ffJsonObject);
-
System.out.println(ffJsonObject.getJsonArray(“data”).getJsonObject(0).getJsonArray(“foods”).getJsonObject(0).getString(“description”));
-
}
-
}
其输出为:
-
{“abc”:123,”xxx”:{“a”:1,”b”:”a”},”dddd”:[1,3,3]}
-
[1,2,3,4,5]
-
{“msg”:”成功”,”code”:0,”data”:[{“foods”:[{“price”:1.2,”name”:”皮蛋粥”,”icon”:”http://xxx.com”,”description”:”好吃的皮蛋粥”,”id”:”123456″}],”name”:”热榜”,”type”:1},{“foods”:[{“price”:10.9,”name”:”慕斯蛋糕”,”icon”:”http://xxx.com”,”description”:”美味爽口”,”id”:”123457″}],”name”:”好吃的”,”type”:2}]}
-
好吃的皮蛋粥
okay,尤其那一长串的链式调用非常的有feel。
题外话:抛去隐藏bug、性能问题不谈,功能性上,一个完整的json类库还缺少与实体类的转换,不过这个不属于编译原理的范畴了,也就不造轮子了,或者等下次有闲情继续完善。
转载请注明:XAMPP中文组官网 » 编译原理练手之撸个Json Parser