Class: RubyLex
Defined in: parsers/parse_rb.rb
Parent: Object
Lexical analyzer for Ruby source
# Set up the lexer over +content+.  Installs the operator dispatch
# table (lex_init) and resets all per-run scanner state.
def initialize(content, options)
  lex_init

  @options = options
  @reader = BufferedReader.new content, @options

  # position tracking
  @exp_line_no = @line_no = 1
  @base_char_no = 0
  @indent = 0

  # current literal / expression state
  @ltype = nil
  @quoted = nil
  @lex_state = EXPR_BEG
  @space_seen = false

  @continue = false
  @line = ""

  # behaviour switches
  @skip_space = false
  @read_auto_clean_up = false
  @exception_on_syntax_error = true
end
# Read one logical line from the input, dropping carriage returns
# but keeping the trailing newline.  Returns nil at end of input.
def gets
  c = getc or return
  line = ""
  begin
    line.concat c unless c == "\r"  # skip CRs, keep everything else
    break if c == "\n"
  end while c = getc
  line
end
# Consume a "#" comment up to (but not including) the trailing
# newline, which is pushed back so the newline rule can see it.
#
# A backslash-newline inside the comment is folded to a single
# space; any other backslashed character keeps its backslash.
#
# Fix: if the input ended immediately after a backslash, getc
# returned nil and `comment << ch` raised TypeError; we now keep
# the backslash and stop cleanly at EOF.
#
# Returns a TkCOMMENT token carrying the comment text.
def identify_comment
  @ltype = "#"
  comment = "#"
  while ch = getc
    if ch == "\\"
      ch = getc
      if ch.nil?            # EOF right after the backslash
        comment << "\\"
        break
      end
      if ch == "\n"
        ch = " "
      else
        comment << "\\"
      end
    else
      if ch == "\n"
        @ltype = nil
        ungetc
        break
      end
    end
    comment << ch
  end
  return Token(TkCOMMENT).set_text(comment)
end
# Scan a global variable after the "$" has been consumed.
# Distinguishes one-character special globals ($~, $/, ...),
# option variables ($-x), regexp back-references ($&, $`, $', $+),
# numbered match groups ($1..), and plain named globals.
def identify_gvar
  @lex_state = EXPR_END
  str = "$"

  ch = getc
  tk =
    case ch
    when /[~_*$?!@\/\\;,=:<>".]/ #"
      # one-character special global
      str << ch
      Token(TkGVAR, str)
    when "-"
      # $-x style option variable
      str << "-" << getc
      Token(TkGVAR, str)
    when "&", "`", "'", "+"
      # regexp back-reference variables
      str << ch
      Token(TkBACK_REF, str)
    when /[1-9]/
      # numbered match group $1, $2, ...
      str << ch
      while (ch = getc) =~ /[0-9]/
        str << ch
      end
      ungetc
      Token(TkNTH_REF)
    when /\w/
      # plain named global: back up over name char and "$",
      # then rescan as an identifier
      ungetc
      ungetc
      return identify_identifier
    else
      ungetc
      Token("$")
    end
  tk.set_text(str)
end
# Lex a here-document (the "<<" itself is already consumed).
# Handles the indentable "<<-" form, quoted terminators, and
# bare-word terminators; the remainder of the introducing line is
# saved and re-read after the document body is consumed.
def identify_here_document
  ch = getc
  if ch == "-"
    ch = getc
    indent = true
  end
  if /['"`]/ =~ ch #'
    # explicitly quoted terminator
    lt = ch
    quoted = ""
    while (c = getc) && c != lt
      quoted.concat c
    end
  else
    # bare terminator behaves like a double-quoted heredoc
    lt = '"'
    quoted = ch.dup
    while (c = getc) && c =~ /\w/
      quoted.concat c
    end
    ungetc
  end

  ltback, @ltype = @ltype, lt

  # stash the rest of this physical line so it can be re-lexed
  # once the document body has been read
  reserve = ""
  while ch = getc
    reserve << ch
    if ch == "\\" #"
      ch = getc
      reserve << ch
    elsif ch == "\n"
      break
    end
  end

  # collect body lines up to the terminator line
  str = ""
  while (l = gets)
    l.chomp!
    l.strip! if indent
    break if l == quoted
    str << l.chomp << "\n"
  end

  @reader.divert_read_from(reserve)

  @ltype = ltback
  @lex_state = EXPR_END
  Token(Ltype2Token[lt], str).set_text(str.dump)
end
# Scan an identifier-like token: keyword, constant, local name,
# instance/class/global variable, or FID (name ending in ! or ?).
# Updates @lex_state, and @indent when a keyword opens or closes a
# block.
def identify_identifier
  token = ""
  token.concat getc if peek(0) =~ /[$@]/
  token.concat getc if peek(0) == "@"   # second @ of a class variable

  while (ch = getc) =~ /\w|_/
    print ":", ch, ":" if RubyLex.debug?
    token.concat ch
  end
  ungetc

  token.concat getc if ch == "!" or ch == "?"

  # variables are classified immediately
  case token
  when /^\$/
    return Token(TkGVAR, token).set_text(token)
  when /^\@/
    @lex_state = EXPR_END
    return Token(TkIVAR, token).set_text(token)
  end

  if @lex_state != EXPR_DOT
    print token, "\n" if RubyLex.debug?

    token_c, *trans = TkReading2Token[token]
    if token_c
      # reserved word
      if (@lex_state != EXPR_BEG &&
          @lex_state != EXPR_FNAME &&
          trans[1])
        # modifier form (if/unless/while/until used as modifiers)
        token_c = TkSymbol2Token[trans[1]]
        @lex_state = trans[0]
      else
        if @lex_state != EXPR_FNAME
          # track block nesting opened/closed by keywords
          if ENINDENT_CLAUSE.include?(token)
            @indent += 1
          elsif DEINDENT_CLAUSE.include?(token)
            @indent -= 1
          end
          @lex_state = trans[0]
        else
          @lex_state = EXPR_END
        end
      end
      return Token(token_c, token).set_text(token)
    end
  end

  if @lex_state == EXPR_FNAME
    @lex_state = EXPR_END
    token.concat getc if peek(0) == '='   # setter name like foo=
  elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
    @lex_state = EXPR_ARG
  else
    @lex_state = EXPR_END
  end

  if token[0, 1] =~ /[A-Z]/
    Token(TkCONSTANT, token).set_text(token)
  elsif token[token.size - 1, 1] =~ /[!?]/
    Token(TkFID, token).set_text(token)
  else
    Token(TkIDENTIFIER, token).set_text(token)
  end
end
# Lex a numeric literal.  +start+ is the already-consumed first
# character ("+", "-", "" or a digit).  Handles 0x hex, leading-0
# octal, decimal integers, and floats with an optional signed
# exponent.
#
# Fix: the original used the TRUE constant, which was removed in
# Ruby 3.0; plain +true+ behaves identically.
def identify_number(start)
  str = start.dup

  if start == "+" or start == "-" or start == ""
    start = getc
    str << start
  end

  @lex_state = EXPR_END

  if start == "0"
    if peek(0) == "x"
      ch = getc
      str << ch
      match = /[0-9a-f_]/
    else
      match = /[0-7_]/
    end
    # consume digits of the radix literal
    while ch = getc
      if ch !~ match
        ungetc
        break
      else
        str << ch
      end
    end
    return Token(TkINTEGER).set_text(str)
  end

  type = TkINTEGER
  allow_point = true   # at most one decimal point
  allow_e = true       # at most one exponent
  while ch = getc
    case ch
    when /[0-9_]/
      str << ch

    when allow_point && "."
      type = TkFLOAT
      if peek(0) !~ /[0-9]/
        # "1." followed by a non-digit is a method call, not a float
        ungetc
        break
      end
      str << ch
      allow_point = false

    when allow_e && "e", allow_e && "E"
      str << ch
      type = TkFLOAT
      if peek(0) =~ /[+-]/
        str << getc
      end
      allow_e = false
      allow_point = false
    else
      ungetc
      break
    end
  end
  Token(type).set_text(str)
end
# Lex a %-literal (%q, %w, %r, ...).  +initial_char+ is "%"; the
# next character selects the literal type, and the character after
# that (or the type character itself for bare %<delim>) is the
# opening delimiter.
def identify_quotation(initial_char)
  ch = getc
  if lt = PERCENT_LTYPE[ch]
    # explicit type letter, e.g. %q or %w
    initial_char += ch
    ch = getc
  elsif ch =~ /\W/
    # bare %<delim> is a double-quoted string
    lt = "\""
  else
    fail SyntaxError, "unknown type of %string ('#{ch}')"
  end
  # paired delimiters close with their counterpart, e.g. %w( ... )
  @quoted = ch unless @quoted = PERCENT_PAREN[ch]
  identify_string(lt, @quoted, ch, initial_char)
end
# Lex a string-like literal (string, regexp, backquote, %-literal).
# +quoted+ is the closing delimiter, +opener+ the opening one when
# the pair differs (parens/brackets/braces), and +initial_char+ any
# already-consumed prefix such as "%w".
#
# Always clears @ltype/@quoted and leaves @lex_state == EXPR_END,
# even if scanning raises.
def identify_string(ltype, quoted = ltype, opener=nil, initial_char = nil)
  @ltype = ltype
  @quoted = quoted
  subtype = nil

  str = ""
  str << initial_char if initial_char
  str << (opener||quoted)

  nest = 0
  begin
    while ch = getc
      str << ch
      if @quoted == ch
        break if nest == 0
        nest -= 1
      elsif opener == ch
        nest += 1
      elsif @ltype != "'" && @ltype != "]" and ch == "#"
        # possible #{...} interpolation
        ch = getc
        if ch == "{"
          subtype = true
          str << ch << skip_inner_expression
        else
          ungetc(ch)
        end
      elsif ch == '\\' #'
        str << read_escape
      end
    end
    # trailing regexp option flags
    if @ltype == "/"
      str << getc if peek(0) =~ /i|o|n|e|s/
    end
    if subtype
      Token(DLtype2Token[ltype], str)
    else
      Token(Ltype2Token[ltype], str)
    end.set_text(str)
  ensure
    @ltype = nil
    @quoted = nil
    @lex_state = EXPR_END
  end
end
# Pull tokens until a line boundary: a newline or end-of-script
# token while no continuation is pending, or token exhaustion.
# Returns the raw text read for that span, or nil once the input
# is used up.
def lex
  tk = nil
  loop do
    tk = token
    break if tk.nil?
    break if (tk.kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) && !@continue
  end
  line = get_read

  if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
    nil
  else
    line
  end
end
# Install the first half of the lexing rules into the operator
# dispatch table @OP.  Each rule maps a leading character sequence
# to a block producing the matching token, possibly delegating to
# an identify_* scanner.
#
# Fix: the original used the TRUE/FALSE constants, which were
# removed in Ruby 3.0, even though this method explicitly supports
# running on Ruby >= 1.9; plain +true+/+false+ behave identically.
def lex_init()
  if RUBY_VERSION.to_f < 1.9
    @OP = SLex.new
  else
    @OP = IRB::SLex.new
  end

  # end-of-input markers (NUL, ^D, ^Z)
  @OP.def_rules("\0", "\004", "\032") do |chars, io|
    Token(TkEND_OF_SCRIPT).set_text(chars)
  end

  # runs of horizontal whitespace
  @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |chars, io|
    @space_seen = true
    while (ch = getc) =~ /[ \t\f\r\13]/
      chars << ch
    end
    ungetc
    Token(TkSPACE).set_text(chars)
  end

  @OP.def_rule("#") do
    |op, io|
    identify_comment
  end

  # =begin ... =end block comments (only at column 0)
  @OP.def_rule("=begin", proc{@prev_char_no == 0 && peek(0) =~ /\s/}) do
    |op, io|
    str = op
    @ltype = "="

    begin
      line = ""
      begin
        ch = getc
        line << ch
      end until ch == "\n"
      str << line
    end until line =~ /^=end/

    ungetc

    @ltype = nil

    if str =~ /\A=begin\s+rdoc/i
      # an rdoc block comment: strip the =begin/=end fence
      str.sub!(/\A=begin.*\n/, '')
      str.sub!(/^=end.*/m, '')
      Token(TkCOMMENT).set_text(str)
    else
      Token(TkRD_COMMENT)#.set_text(str)
    end
  end

  @OP.def_rule("\n") do
    print "\\n\n" if RubyLex.debug?
    case @lex_state
    when EXPR_BEG, EXPR_FNAME, EXPR_DOT
      # an expression is pending; this newline continues the line
      @continue = true
    else
      @continue = false
      @lex_state = EXPR_BEG
    end
    Token(TkNL).set_text("\n")
  end

  @OP.def_rules("*", "**",
                "!", "!=", "!~",
                "=", "==", "===",
                "=~", "<=>",
                "<", "<=",
                ">", ">=", ">>") do
    |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  # "<<" is either a shift operator or a here-document opener
  @OP.def_rules("<<") do
    |op, io|
    tk = nil
    if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
       (@lex_state != EXPR_ARG || @space_seen)
      c = peek(0)
      if /[-\w_\"\'\`]/ =~ c
        tk = identify_here_document
      end
    end
    if !tk
      @lex_state = EXPR_BEG
      tk = Token(op).set_text(op)
    end
    tk
  end

  @OP.def_rules("'", '"') do
    |op, io|
    identify_string(op)
  end

  # backquote names a method after "def"; otherwise a command string
  @OP.def_rules("`") do
    |op, io|
    if @lex_state == EXPR_FNAME
      Token(op).set_text(op)
    else
      identify_string(op)
    end
  end

  # "?" is either the ternary operator or a character literal
  @OP.def_rules('?') do
    |op, io|
    if @lex_state == EXPR_END
      @lex_state = EXPR_BEG
      Token(TkQUESTION).set_text(op)
    else
      ch = getc
      if @lex_state == EXPR_ARG && ch !~ /\s/
        ungetc
        @lex_state = EXPR_BEG;
        Token(TkQUESTION).set_text(op)
      else
        str = op
        str << ch
        if (ch == '\\') #'
          str << read_escape
        end
        @lex_state = EXPR_END
        Token(TkINTEGER).set_text(str)
      end
    end
  end

  @OP.def_rules("&", "&&", "|", "||") do
    |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  @OP.def_rules("+=", "-=", "*=", "**=",
                "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
    |op, io|
    @lex_state = EXPR_BEG
    op =~ /^(.*)=$/
    Token(TkOPASGN, $1).set_text(op)
  end

  @OP.def_rule("+@", proc{@lex_state == EXPR_FNAME}) do |op, io|
    Token(TkUPLUS).set_text(op)
  end

  @OP.def_rule("-@", proc{@lex_state == EXPR_FNAME}) do |op, io|
    Token(TkUMINUS).set_text(op)
  end

  # "+"/"-" may begin a signed numeric literal
  @OP.def_rules("+", "-") do
    |op, io|
    catch(:RET) do
      if @lex_state == EXPR_ARG
        if @space_seen and peek(0) =~ /[0-9]/
          throw :RET, identify_number(op)
        else
          @lex_state = EXPR_BEG
        end
      elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
        throw :RET, identify_number(op)
      else
        @lex_state = EXPR_BEG
      end
      Token(op).set_text(op)
    end
  end

  # "." starts a float (".5") or a method call
  @OP.def_rule(".") do
    @lex_state = EXPR_BEG
    if peek(0) =~ /[0-9]/
      ungetc
      identify_number("")
    else
      # for obj.if
      @lex_state = EXPR_DOT
      Token(TkDOT).set_text(".")
    end
  end

  @OP.def_rules("..", "...") do
    |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  lex_int2
end
# Install the second half of the lexing rules: closing brackets,
# colons, slash, percent, braces, globals, instance variables,
# __END__, and the fallback rule for numbers and identifiers.
#
# Fix: the guard proc for the "~@" rule used assignment
# (@lex_state = EXPR_FNAME) where every sibling unary-method rule
# ("+@", "-@", "[]", "[]=") tests with ==; the assignment made the
# guard always succeed and clobbered the lexer state as a side
# effect.
def lex_int2
  @OP.def_rules("]", "}", ")") do
    |op, io|
    @lex_state = EXPR_END
    @indent -= 1
    Token(op).set_text(op)
  end

  # ":" — bare colon or start of a symbol
  @OP.def_rule(":") do
    if @lex_state == EXPR_END || peek(0) =~ /\s/
      @lex_state = EXPR_BEG
      tk = Token(TkCOLON)
    else
      @lex_state = EXPR_FNAME;
      tk = Token(TkSYMBEG)
    end
    tk.set_text(":")
  end

  @OP.def_rule("::") do
    # p @lex_state.id2name, @space_seen
    if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
      @lex_state = EXPR_BEG
      tk = Token(TkCOLON3)
    else
      @lex_state = EXPR_DOT
      tk = Token(TkCOLON2)
    end
    tk.set_text("::")
  end

  # "/" — regexp start, /=, or division
  @OP.def_rule("/") do
    |op, io|
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      identify_string(op)
    elsif peek(0) == '='
      getc
      @lex_state = EXPR_BEG
      Token(TkOPASGN, :/).set_text("/=") #")
    elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
      identify_string(op)
    else
      @lex_state = EXPR_BEG
      Token("/").set_text(op)
    end
  end

  @OP.def_rules("^") do
    @lex_state = EXPR_BEG
    Token("^").set_text("^")
  end

  @OP.def_rules(",", ";") do
    |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  @OP.def_rule("~") do
    @lex_state = EXPR_BEG
    Token("~").set_text("~")
  end

  # BUG FIX: guard was `proc{@lex_state = EXPR_FNAME}` (assignment)
  @OP.def_rule("~@", proc{@lex_state == EXPR_FNAME}) do
    @lex_state = EXPR_BEG
    Token("~").set_text("~@")
  end

  @OP.def_rule("(") do
    @indent += 1
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      @lex_state = EXPR_BEG
      tk = Token(TkfLPAREN)
    else
      @lex_state = EXPR_BEG
      tk = Token(TkLPAREN)
    end
    tk.set_text("(")
  end

  @OP.def_rule("[]", proc{@lex_state == EXPR_FNAME}) do
    Token("[]").set_text("[]")
  end

  @OP.def_rule("[]=", proc{@lex_state == EXPR_FNAME}) do
    Token("[]=").set_text("[]=")
  end

  # "[" — element-reference vs array-literal bracket
  @OP.def_rule("[") do
    @indent += 1
    if @lex_state == EXPR_FNAME
      t = Token(TkfLBRACK)
    else
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        t = Token(TkLBRACK)
      elsif @lex_state == EXPR_ARG && @space_seen
        t = Token(TkLBRACK)
      else
        t = Token(TkfLBRACK)
      end
      @lex_state = EXPR_BEG
    end
    t.set_text("[")
  end

  # "{" — block brace vs hash-literal brace
  @OP.def_rule("{") do
    @indent += 1
    if @lex_state != EXPR_END && @lex_state != EXPR_ARG
      t = Token(TkLBRACE)
    else
      t = Token(TkfLBRACE)
    end
    @lex_state = EXPR_BEG
    t.set_text("{")
  end

  # backslash-newline is a line continuation
  @OP.def_rule('\\') do #'
    if getc == "\n"
      @space_seen = true
      @continue = true
      Token(TkSPACE).set_text("\\\n")
    else
      ungetc
      Token("\\").set_text("\\") #"
    end
  end

  # "%" — %-literal, %=, or modulo
  @OP.def_rule('%') do
    |op, io|
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      identify_quotation('%')
    elsif peek(0) == '='
      getc
      Token(TkOPASGN, "%").set_text("%=")
    elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
      identify_quotation('%')
    else
      @lex_state = EXPR_BEG
      Token("%").set_text("%")
    end
  end

  @OP.def_rule('$') do #'
    identify_gvar
  end

  @OP.def_rule('@') do
    if peek(0) =~ /[@\w_]/
      ungetc
      identify_identifier
    else
      Token("@").set_text("@")
    end
  end

  # "__END__" at column 0 terminates lexing
  @OP.def_rule("__END__", proc{@prev_char_no == 0 && peek(0) =~ /[\r\n]/}) do
    throw :eof
  end

  # fallback: numbers and identifiers
  @OP.def_rule("") do
    |op, io|
    printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
    if peek(0) =~ /[0-9]/
      t = identify_number("")
    elsif peek(0) =~ /[\w_]/
      t = identify_identifier
    end
    printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
    t
  end

  p @OP if RubyLex.debug?
end
# File parsers/parse_rb.rb, line 512 512: def peek_equal?(str) 513: @reader.peek_equal(str) 514: end
# File parsers/parse_rb.rb, line 1296 1296: def read_escape 1297: res = "" 1298: case ch = getc 1299: when /[0-7]/ 1300: ungetc ch 1301: 3.times do 1302: case ch = getc 1303: when /[0-7]/ 1304: when nil 1305: break 1306: else 1307: ungetc 1308: break 1309: end 1310: res << ch 1311: end 1312: 1313: when "x" 1314: res << ch 1315: 2.times do 1316: case ch = getc 1317: when /[0-9a-fA-F]/ 1318: when nil 1319: break 1320: else 1321: ungetc 1322: break 1323: end 1324: res << ch 1325: end 1326: 1327: when "M" 1328: res << ch 1329: if (ch = getc) != '-' 1330: ungetc 1331: else 1332: res << ch 1333: if (ch = getc) == "\\" #" 1334: res << ch 1335: res << read_escape 1336: else 1337: res << ch 1338: end 1339: end 1340: 1341: when "C", "c" #, "^" 1342: res << ch 1343: if ch == "C" and (ch = getc) != "-" 1344: ungetc 1345: else 1346: res << ch 1347: if (ch = getc) == "\\" #" 1348: res << ch 1349: res << read_escape 1350: else 1351: res << ch 1352: end 1353: end 1354: else 1355: res << ch 1356: end 1357: res 1358: end
# Consume the body of a #{...} interpolation (the "#{" has already
# been read), tracking nested braces, and return the consumed text
# including the closing "}".
def skip_inner_expression
  res = ""
  nest = 0
  while ch = getc
    res << ch
    case ch
    when '}'
      break if nest.zero?
      nest -= 1
    when '{'
      nest += 1
    end
  end
  res
end
# Fetch the next token from the dispatch table, skipping space
# tokens when @skip_space is set.  A SyntaxError either aborts
# (when @exception_on_syntax_error) or is converted into a TkError
# token at the current position.
def token
  set_token_position(line_no, char_no)
  begin
    begin
      tk = @OP.match(self)
      @space_seen = tk.kind_of?(TkSPACE)
    rescue SyntaxError
      abort if @exception_on_syntax_error
      tk = TkError.new(line_no, char_no)
    end
  end while @skip_space and tk.kind_of?(TkSPACE)
  get_read if @read_auto_clean_up   # discard the buffered raw text
  # throw :eof unless tk
  tk
end