Class | RubyLex |
In: |
parsers/parse_rb.rb
|
Parent: | Object |
Lexical analyzer for Ruby source
ENINDENT_CLAUSE | = | [ "case", "class", "def", "do", "for", "if", "module", "unless", "until", "while", "begin" ] |
DEINDENT_CLAUSE | = | ["end"] |
PERCENT_LTYPE | = | { "q" => "\'", "Q" => "\"", "x" => "\`", "r" => "/", "w" => "]" } |
PERCENT_PAREN | = | { "{" => "}", "[" => "]", "<" => ">", "(" => ")" } |
Ltype2Token | = | { "\'" => TkSTRING, "\"" => TkSTRING, "\`" => TkXSTRING, "/" => TkREGEXP, "]" => TkDSTRING } |
DLtype2Token | = | { "\"" => TkDSTRING, "\`" => TkDXSTRING, "/" => TkDREGEXP, } |
continue | [R] | include IRB # 1.8.2 doesn't support IRB::SLex |
exception_on_syntax_error | [RW] | |
indent | [R] | |
lex_state | [R] | |
read_auto_clean_up | [RW] | |
skip_space | [RW] |
# File parsers/parse_rb.rb, line 447 447: def initialize(content) 448: lex_init 449: 450: @reader = BufferedReader.new(content) 451: 452: @exp_line_no = @line_no = 1 453: @base_char_no = 0 454: @indent = 0 455: 456: @ltype = nil 457: @quoted = nil 458: @lex_state = EXPR_BEG 459: @space_seen = false 460: 461: @continue = false 462: @line = "" 463: 464: @skip_space = false 465: @read_auto_clean_up = false 466: @exception_on_syntax_error = true 467: end
# File parsers/parse_rb.rb, line 495 495: def gets 496: c = getc or return 497: l = "" 498: begin 499: l.concat c unless c == "\r" 500: break if c == "\n" 501: end while c = getc 502: l 503: end
# File parsers/parse_rb.rb, line 1269 1269: def identify_comment 1270: @ltype = "#" 1271: comment = "#" 1272: while ch = getc 1273: if ch == "\\" 1274: ch = getc 1275: if ch == "\n" 1276: ch = " " 1277: else 1278: comment << "\\" 1279: end 1280: else 1281: if ch == "\n" 1282: @ltype = nil 1283: ungetc 1284: break 1285: end 1286: end 1287: comment << ch 1288: end 1289: return Token(TkCOMMENT).set_text(comment) 1290: end
# File parsers/parse_rb.rb, line 964 964: def identify_gvar 965: @lex_state = EXPR_END 966: str = "$" 967: 968: tk = case ch = getc 969: when /[~_*$?!@\/\\;,=:<>".]/ #" 970: str << ch 971: Token(TkGVAR, str) 972: 973: when "-" 974: str << "-" << getc 975: Token(TkGVAR, str) 976: 977: when "&", "`", "'", "+" 978: str << ch 979: Token(TkBACK_REF, str) 980: 981: when /[1-9]/ 982: str << ch 983: while (ch = getc) =~ /[0-9]/ 984: str << ch 985: end 986: ungetc 987: Token(TkNTH_REF) 988: when /\w/ 989: ungetc 990: ungetc 991: return identify_identifier 992: else 993: ungetc 994: Token("$") 995: end 996: tk.set_text(str) 997: end
# File parsers/parse_rb.rb, line 1074 1074: def identify_here_document 1075: ch = getc 1076: if ch == "-" 1077: ch = getc 1078: indent = true 1079: end 1080: if /['"`]/ =~ ch # ' 1081: lt = ch 1082: quoted = "" 1083: while (c = getc) && c != lt 1084: quoted.concat c 1085: end 1086: else 1087: lt = '"' 1088: quoted = ch.dup 1089: while (c = getc) && c =~ /\w/ 1090: quoted.concat c 1091: end 1092: ungetc 1093: end 1094: 1095: ltback, @ltype = @ltype, lt 1096: reserve = "" 1097: 1098: while ch = getc 1099: reserve << ch 1100: if ch == "\\" #" 1101: ch = getc 1102: reserve << ch 1103: elsif ch == "\n" 1104: break 1105: end 1106: end 1107: 1108: str = "" 1109: while (l = gets) 1110: l.chomp! 1111: l.strip! if indent 1112: break if l == quoted 1113: str << l.chomp << "\n" 1114: end 1115: 1116: @reader.divert_read_from(reserve) 1117: 1118: @ltype = ltback 1119: @lex_state = EXPR_END 1120: Token(Ltype2Token[lt], str).set_text(str.dump) 1121: end
# File parsers/parse_rb.rb, line 999 999: def identify_identifier 1000: token = "" 1001: token.concat getc if peek(0) =~ /[$@]/ 1002: token.concat getc if peek(0) == "@" 1003: 1004: while (ch = getc) =~ /\w|_/ 1005: print ":", ch, ":" if RubyLex.debug? 1006: token.concat ch 1007: end 1008: ungetc 1009: 1010: if ch == "!" or ch == "?" 1011: token.concat getc 1012: end 1013: # fix token 1014: 1015: # $stderr.puts "identifier - #{token}, state = #@lex_state" 1016: 1017: case token 1018: when /^\$/ 1019: return Token(TkGVAR, token).set_text(token) 1020: when /^\@/ 1021: @lex_state = EXPR_END 1022: return Token(TkIVAR, token).set_text(token) 1023: end 1024: 1025: if @lex_state != EXPR_DOT 1026: print token, "\n" if RubyLex.debug? 1027: 1028: token_c, *trans = TkReading2Token[token] 1029: if token_c 1030: # reserved word? 1031: 1032: if (@lex_state != EXPR_BEG && 1033: @lex_state != EXPR_FNAME && 1034: trans[1]) 1035: # modifiers 1036: token_c = TkSymbol2Token[trans[1]] 1037: @lex_state = trans[0] 1038: else 1039: if @lex_state != EXPR_FNAME 1040: if ENINDENT_CLAUSE.include?(token) 1041: @indent += 1 1042: elsif DEINDENT_CLAUSE.include?(token) 1043: @indent -= 1 1044: end 1045: @lex_state = trans[0] 1046: else 1047: @lex_state = EXPR_END 1048: end 1049: end 1050: return Token(token_c, token).set_text(token) 1051: end 1052: end 1053: 1054: if @lex_state == EXPR_FNAME 1055: @lex_state = EXPR_END 1056: if peek(0) == '=' 1057: token.concat getc 1058: end 1059: elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT 1060: @lex_state = EXPR_ARG 1061: else 1062: @lex_state = EXPR_END 1063: end 1064: 1065: if token[0, 1] =~ /[A-Z]/ 1066: return Token(TkCONSTANT, token).set_text(token) 1067: elsif token[token.size - 1, 1] =~ /[!?]/ 1068: return Token(TkFID, token).set_text(token) 1069: else 1070: return Token(TkIDENTIFIER, token).set_text(token) 1071: end 1072: end
# File parsers/parse_rb.rb, line 1142 1142: def identify_number(start) 1143: str = start.dup 1144: 1145: if start == "+" or start == "-" or start == "" 1146: start = getc 1147: str << start 1148: end 1149: 1150: @lex_state = EXPR_END 1151: 1152: if start == "0" 1153: if peek(0) == "x" 1154: ch = getc 1155: str << ch 1156: match = /[0-9a-f_]/ 1157: else 1158: match = /[0-7_]/ 1159: end 1160: while ch = getc 1161: if ch !~ match 1162: ungetc 1163: break 1164: else 1165: str << ch 1166: end 1167: end 1168: return Token(TkINTEGER).set_text(str) 1169: end 1170: 1171: type = TkINTEGER 1172: allow_point = TRUE 1173: allow_e = TRUE 1174: while ch = getc 1175: case ch 1176: when /[0-9_]/ 1177: str << ch 1178: 1179: when allow_point && "." 1180: type = TkFLOAT 1181: if peek(0) !~ /[0-9]/ 1182: ungetc 1183: break 1184: end 1185: str << ch 1186: allow_point = false 1187: 1188: when allow_e && "e", allow_e && "E" 1189: str << ch 1190: type = TkFLOAT 1191: if peek(0) =~ /[+-]/ 1192: str << getc 1193: end 1194: allow_e = false 1195: allow_point = false 1196: else 1197: ungetc 1198: break 1199: end 1200: end 1201: Token(type).set_text(str) 1202: end
# File parsers/parse_rb.rb, line 1123 1123: def identify_quotation(initial_char) 1124: ch = getc 1125: if lt = PERCENT_LTYPE[ch] 1126: initial_char += ch 1127: ch = getc 1128: elsif ch =~ /\W/ 1129: lt = "\"" 1130: else 1131: RubyLex.fail SyntaxError, "unknown type of %string ('#{ch}')" 1132: end 1133: # if ch !~ /\W/ 1134: # ungetc 1135: # next 1136: # end 1137: #@ltype = lt 1138: @quoted = ch unless @quoted = PERCENT_PAREN[ch] 1139: identify_string(lt, @quoted, ch, initial_char) 1140: end
# File parsers/parse_rb.rb, line 1204 1204: def identify_string(ltype, quoted = ltype, opener=nil, initial_char = nil) 1205: @ltype = ltype 1206: @quoted = quoted 1207: subtype = nil 1208: 1209: str = "" 1210: str << initial_char if initial_char 1211: str << (opener||quoted) 1212: 1213: nest = 0 1214: begin 1215: while ch = getc 1216: str << ch 1217: if @quoted == ch 1218: if nest == 0 1219: break 1220: else 1221: nest -= 1 1222: end 1223: elsif opener == ch 1224: nest += 1 1225: elsif @ltype != "'" && @ltype != "]" and ch == "#" 1226: ch = getc 1227: if ch == "{" 1228: subtype = true 1229: str << ch << skip_inner_expression 1230: else 1231: ungetc(ch) 1232: end 1233: elsif ch == '\\' #' 1234: str << read_escape 1235: end 1236: end 1237: if @ltype == "/" 1238: if peek(0) =~ /i|o|n|e|s/ 1239: str << getc 1240: end 1241: end 1242: if subtype 1243: Token(DLtype2Token[ltype], str) 1244: else 1245: Token(Ltype2Token[ltype], str) 1246: end.set_text(str) 1247: ensure 1248: @ltype = nil 1249: @quoted = nil 1250: @lex_state = EXPR_END 1251: end 1252: end
# File parsers/parse_rb.rb, line 518 518: def lex 519: until (((tk = token).kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) && 520: !@continue or 521: tk.nil?) 522: end 523: line = get_read 524: 525: if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil? 526: nil 527: else 528: line 529: end 530: end
# File parsers/parse_rb.rb, line 588
# Install the first half of the operator/recognizer rule table on the
# @OP SLex matcher.  Each rule maps a leading character sequence (plus
# an optional state predicate) to a block that consumes the rest of
# the construct and returns a token.  Continues in lex_int2.
def lex_init()
  @OP = SLex.new
  # @OP = IRB::SLex.new # 1.8.2 doesn't support #IRB::SLex

  # NUL, ^D, ^Z all terminate the script.
  @OP.def_rules("\0", "\004", "\032") do |chars, io|
    Token(TkEND_OF_SCRIPT).set_text(chars)
  end

  # Runs of horizontal whitespace collapse into one TkSPACE.
  @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |chars, io|
    @space_seen = TRUE
    while (ch = getc) =~ /[ \t\f\r\13]/
      chars << ch
    end
    ungetc
    Token(TkSPACE).set_text(chars)
  end

  @OP.def_rule("#") do
    |op, io|
    identify_comment
  end

  # "=begin" at column 0 opens an embedded document through "=end".
  @OP.def_rule("=begin", proc{@prev_char_no == 0 && peek(0) =~ /\s/}) do
    |op, io|
    str = op
    @ltype = "="


    begin
      line = ""
      begin
        ch = getc
        line << ch
      end until ch == "\n"
      str << line
    end until line =~ /^=end/

    ungetc

    @ltype = nil

    # "=begin rdoc" blocks are kept as comments (with the markers
    # stripped); other embedded docs become TkRD_COMMENT.
    if str =~ /\A=begin\s+rdoc/i
      str.sub!(/\A=begin.*\n/, '')
      str.sub!(/^=end.*/m, '')
      Token(TkCOMMENT).set_text(str)
    else
      Token(TkRD_COMMENT)#.set_text(str)
    end
  end

  # Newline: in BEG/FNAME/DOT state the expression is incomplete, so
  # mark the logical line as continuing.
  @OP.def_rule("\n") do
    print "\\n\n" if RubyLex.debug?
    case @lex_state
    when EXPR_BEG, EXPR_FNAME, EXPR_DOT
      @continue = TRUE
    else
      @continue = FALSE
      @lex_state = EXPR_BEG
    end
    Token(TkNL).set_text("\n")
  end

  @OP.def_rules("*", "**",
                "!", "!=", "!~",
                "=", "==", "===",
                "=~", "<=>",
                "<", "<=",
                ">", ">=", ">>") do
    |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  # "<<" may start a heredoc depending on state and what follows.
  @OP.def_rules("<<") do
    |op, io|
    tk = nil
    if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
        (@lex_state != EXPR_ARG || @space_seen)
      c = peek(0)
      if /[-\w_\"\'\`]/ =~ c
        tk = identify_here_document
      end
    end
    if !tk
      @lex_state = EXPR_BEG
      tk = Token(op).set_text(op)
    end
    tk
  end

  @OP.def_rules("'", '"') do
    |op, io|
    identify_string(op)
  end

  # Backtick is an operator name in FNAME position (def `), else an
  # xstring literal.
  @OP.def_rules("`") do
    |op, io|
    if @lex_state == EXPR_FNAME
      Token(op).set_text(op)
    else
      identify_string(op)
    end
  end

  # "?" is either the ternary operator or a character literal (?a),
  # which in 1.8 lexes as an integer.
  @OP.def_rules('?') do
    |op, io|
    if @lex_state == EXPR_END
      @lex_state = EXPR_BEG
      Token(TkQUESTION).set_text(op)
    else
      ch = getc
      if @lex_state == EXPR_ARG && ch !~ /\s/
        ungetc
        @lex_state = EXPR_BEG;
        Token(TkQUESTION).set_text(op)
      else
        str = op
        str << ch
        if (ch == '\\') #'
          str << read_escape
        end
        @lex_state = EXPR_END
        Token(TkINTEGER).set_text(str)
      end
    end
  end

  @OP.def_rules("&", "&&", "|", "||") do
    |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  # Op-assign: the captured $1 is the bare operator name.
  @OP.def_rules("+=", "-=", "*=", "**=",
                "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
    |op, io|
    @lex_state = EXPR_BEG
    op =~ /^(.*)=$/
    Token(TkOPASGN, $1).set_text(op)
  end

  # Unary method names +@ / -@ are only valid after "def" etc.
  @OP.def_rule("+@", proc{@lex_state == EXPR_FNAME}) do |op, io|
    Token(TkUPLUS).set_text(op)
  end

  @OP.def_rule("-@", proc{@lex_state == EXPR_FNAME}) do |op, io|
    Token(TkUMINUS).set_text(op)
  end

  # "+"/"-" are either a sign on a numeric literal or a binary op.
  @OP.def_rules("+", "-") do
    |op, io|
    catch(:RET) do
      if @lex_state == EXPR_ARG
        if @space_seen and peek(0) =~ /[0-9]/
          throw :RET, identify_number(op)
        else
          @lex_state = EXPR_BEG
        end
      elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
        throw :RET, identify_number(op)
      else
        @lex_state = EXPR_BEG
      end
      Token(op).set_text(op)
    end
  end

  # "." followed by a digit is a float like ".5"; otherwise a method
  # call dot (EXPR_DOT lets keywords be used as method names).
  @OP.def_rule(".") do
    @lex_state = EXPR_BEG
    if peek(0) =~ /[0-9]/
      ungetc
      identify_number("")
    else
      # for obj.if
      @lex_state = EXPR_DOT
      Token(TkDOT).set_text(".")
    end
  end

  @OP.def_rules("..", "...") do
    |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  lex_int2
end
# File parsers/parse_rb.rb, line 775
# Install the second half of the @OP rule table: closing brackets,
# colons, slash (division vs. regexp), %-literals, globals, ivars,
# __END__, and the fallback rule that dispatches numbers/identifiers.
def lex_int2
  @OP.def_rules("]", "}", ")") do
    |op, io|
    @lex_state = EXPR_END
    @indent -= 1
    Token(op).set_text(op)
  end

  # ":" is either the ternary separator or the start of a symbol.
  @OP.def_rule(":") do
    if @lex_state == EXPR_END || peek(0) =~ /\s/
      @lex_state = EXPR_BEG
      tk = Token(TkCOLON)
    else
      @lex_state = EXPR_FNAME;
      tk = Token(TkSYMBEG)
    end
    tk.set_text(":")
  end

  # "::" — top-level constant reference vs. scope operator.
  @OP.def_rule("::") do
    # p @lex_state.id2name, @space_seen
    if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
      @lex_state = EXPR_BEG
      tk = Token(TkCOLON3)
    else
      @lex_state = EXPR_DOT
      tk = Token(TkCOLON2)
    end
    tk.set_text("::")
  end

  # "/" — regexp literal in BEG/MID (or bare-arg) position, "/=" as
  # op-assign, otherwise division.
  @OP.def_rule("/") do
    |op, io|
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      identify_string(op)
    elsif peek(0) == '='
      getc
      @lex_state = EXPR_BEG
      Token(TkOPASGN, :/).set_text("/=") #")
    elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
      identify_string(op)
    else
      @lex_state = EXPR_BEG
      Token("/").set_text(op)
    end
  end

  @OP.def_rules("^") do
    @lex_state = EXPR_BEG
    Token("^").set_text("^")
  end

  # @OP.def_rules("^=") do
  # @lex_state = EXPR_BEG
  # Token(TkOPASGN, :^)
  # end

  @OP.def_rules(",", ";") do
    |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  @OP.def_rule("~") do
    @lex_state = EXPR_BEG
    Token("~").set_text("~")
  end

  # NOTE(review): the predicate uses "=" (assignment), not "==" — it
  # always fires and clobbers @lex_state; preserved as in the source.
  @OP.def_rule("~@", proc{@lex_state = EXPR_FNAME}) do
    @lex_state = EXPR_BEG
    Token("~").set_text("~@")
  end

  # "(" — TkfLPAREN marks a call-argument paren, TkLPAREN a grouping.
  @OP.def_rule("(") do
    @indent += 1
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      @lex_state = EXPR_BEG
      tk = Token(TkfLPAREN)
    else
      @lex_state = EXPR_BEG
      tk = Token(TkLPAREN)
    end
    tk.set_text("(")
  end

  # "[]" / "[]=" as method names in FNAME position.
  @OP.def_rule("[]", proc{@lex_state == EXPR_FNAME}) do
    Token("[]").set_text("[]")
  end

  @OP.def_rule("[]=", proc{@lex_state == EXPR_FNAME}) do
    Token("[]=").set_text("[]=")
  end

  # "[" — array literal (TkLBRACK) vs. index operator (TkfLBRACK).
  @OP.def_rule("[") do
    @indent += 1
    if @lex_state == EXPR_FNAME
      t = Token(TkfLBRACK)
    else
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        t = Token(TkLBRACK)
      elsif @lex_state == EXPR_ARG && @space_seen
        t = Token(TkLBRACK)
      else
        t = Token(TkfLBRACK)
      end
      @lex_state = EXPR_BEG
    end
    t.set_text("[")
  end

  # "{" — block brace (TkLBRACE) vs. hash literal (TkfLBRACE).
  @OP.def_rule("{") do
    @indent += 1
    if @lex_state != EXPR_END && @lex_state != EXPR_ARG
      t = Token(TkLBRACE)
    else
      t = Token(TkfLBRACE)
    end
    @lex_state = EXPR_BEG
    t.set_text("{")
  end

  # Backslash-newline is a line continuation; a lone backslash is its
  # own token.
  @OP.def_rule('\\') do #'
    if getc == "\n"
      @space_seen = true
      @continue = true
      Token(TkSPACE).set_text("\\\n")
    else
      ungetc
      Token("\\").set_text("\\") #"
    end
  end

  # "%" — %-literal in literal position, "%=" op-assign, else modulo.
  @OP.def_rule('%') do
    |op, io|
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      identify_quotation('%')
    elsif peek(0) == '='
      getc
      Token(TkOPASGN, "%").set_text("%=")
    elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
      identify_quotation('%')
    else
      @lex_state = EXPR_BEG
      Token("%").set_text("%")
    end
  end

  @OP.def_rule('$') do #'
    identify_gvar
  end

  @OP.def_rule('@') do
    if peek(0) =~ /[@\w_]/
      ungetc
      identify_identifier
    else
      Token("@").set_text("@")
    end
  end

  # @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
  # |op, io|
  # @indent += 1
  # @lex_state = EXPR_FNAME
  # # @lex_state = EXPR_END
  # # until @rests[0] == "\n" or @rests[0] == ";"
  # # rests.shift
  # # end
  # end

  # "__END__" alone on a line ends lexing entirely.
  @OP.def_rule("__END__", proc{@prev_char_no == 0 && peek(0) =~ /[\r\n]/}) do
    throw :eof
  end

  # Fallback rule: no operator matched — try a number, then an
  # identifier.
  @OP.def_rule("") do
    |op, io|
    printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
    if peek(0) =~ /[0-9]/
      t = identify_number("")
    elsif peek(0) =~ /[\w_]/
      t = identify_identifier
    end
    printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
    t
  end

  p @OP if RubyLex.debug?
end
# File parsers/parse_rb.rb, line 510 510: def peek_equal?(str) 511: @reader.peek_equal(str) 512: end
# File parsers/parse_rb.rb, line 1292 1292: def read_escape 1293: res = "" 1294: case ch = getc 1295: when /[0-7]/ 1296: ungetc ch 1297: 3.times do 1298: case ch = getc 1299: when /[0-7]/ 1300: when nil 1301: break 1302: else 1303: ungetc 1304: break 1305: end 1306: res << ch 1307: end 1308: 1309: when "x" 1310: res << ch 1311: 2.times do 1312: case ch = getc 1313: when /[0-9a-fA-F]/ 1314: when nil 1315: break 1316: else 1317: ungetc 1318: break 1319: end 1320: res << ch 1321: end 1322: 1323: when "M" 1324: res << ch 1325: if (ch = getc) != '-' 1326: ungetc 1327: else 1328: res << ch 1329: if (ch = getc) == "\\" #" 1330: res << ch 1331: res << read_escape 1332: else 1333: res << ch 1334: end 1335: end 1336: 1337: when "C", "c" #, "^" 1338: res << ch 1339: if ch == "C" and (ch = getc) != "-" 1340: ungetc 1341: else 1342: res << ch 1343: if (ch = getc) == "\\" #" 1344: res << ch 1345: res << read_escape 1346: else 1347: res << ch 1348: end 1349: end 1350: else 1351: res << ch 1352: end 1353: res 1354: end
# File parsers/parse_rb.rb, line 1254 1254: def skip_inner_expression 1255: res = "" 1256: nest = 0 1257: while (ch = getc) 1258: res << ch 1259: if ch == '}' 1260: break if nest.zero? 1261: nest -= 1 1262: elsif ch == '{' 1263: nest += 1 1264: end 1265: end 1266: res 1267: end
# File parsers/parse_rb.rb, line 532 532: def token 533: set_token_position(line_no, char_no) 534: begin 535: begin 536: tk = @OP.match(self) 537: @space_seen = tk.kind_of?(TkSPACE) 538: rescue SyntaxError 539: abort if @exception_on_syntax_error 540: tk = TkError.new(line_no, char_no) 541: end 542: end while @skip_space and tk.kind_of?(TkSPACE) 543: if @read_auto_clean_up 544: get_read 545: end 546: # throw :eof unless tk 547: p tk if $DEBUG 548: tk 549: end