1 /* 2 Copyright 2002-2004 John Plevyak, All Rights Reserved 3 */ 4 module ddparser.gram; 5 6 import std.bitmanip; 7 import ddparser.util; 8 import ddparser.dparse_tables; 9 import ddparser.lr; 10 import ddparser.lex; 11 import ddparser.dparse; 12 import ddparser.parse; 13 import std.stdio; 14 import std.json; 15 import std.conv; 16 import ddparser.serialize; 17 import std.string; 18 import std.ascii; 19 import std.algorithm; 20 import std.array; 21 22 enum EOF_SENTINAL = "\377"; 23 enum NO_PROD =0xFFFFFFFF; 24 25 26 alias Item = Elem; 27 28 struct Code { 29 string code; 30 D_ReductionCode _f; 31 32 int line; 33 void serialize(Serializer s) 34 { 35 //s.map(line, "line"); 36 //if (line != -1) 37 // s.mapCString(code, "code"); 38 } 39 40 @property void f(D_ReductionCode c) 41 { 42 _f = c; 43 line = -1; 44 code = ""; 45 } 46 47 @property D_ReductionCode f() 48 { 49 if (line == -1) return _f; 50 return null; 51 } 52 } 53 54 struct Goto { 55 Elem *elem; 56 State *state; 57 } 58 59 enum ActionKind { 60 ACTION_ACCEPT, ACTION_SHIFT, ACTION_REDUCE, ACTION_SHIFT_TRAILING 61 } 62 63 struct Action { 64 ActionKind kind; 65 Term *term; 66 Rule *rule; 67 State *state; 68 uint index; 69 size_t[2] temp_key = [size_t.max]; 70 } 71 72 alias VecAction = Vec!(Action*); 73 74 struct Hint { 75 uint depth; 76 State *state; 77 Rule *rule; 78 } 79 alias VecHint = Vec!(Hint*); 80 81 alias VecScanStateTransition = Vec!(ScanStateTransition*); 82 83 struct Scanner { 84 ScanState*[] states; 85 VecScanStateTransition transitions; 86 } 87 88 struct State { 89 uint index; 90 uint64 hash; 91 Item*[] items; 92 bool[Item*] items_hash; 93 Goto*[] gotos; 94 VecAction shift_actions; 95 VecAction reduce_actions; 96 VecHint right_epsilon_hints; 97 VecHint error_recovery_hints; 98 Scanner scanner; 99 mixin(bitfields!( 100 bool, "accept", 1, 101 uint, "scanner_code", 1, 102 uint, "goto_on_token", 1, 103 uint, "scan_kind", 3, 104 uint, "trailing_context", 1, 105 uint, "", 25 106 )); 107 int goto_table_offset; 108 State *same_shifts; 109 State *reduces_to; 110 Rule *reduces_with; 111 Rule *reduces_to_then_with; 112 } 113 114 enum ASSOC_LEFT =0x0001; 115 enum ASSOC_RIGHT =0x0002; 116 enum ASSOC_NARY =0x0004; 117 enum ASSOC_UNARY =0x0008; 118 enum ASSOC_BINARY =0x0010; 119 120 enum AssocKind { 121 ASSOC_NONE = 0, 122 ASSOC_NARY_LEFT = (ASSOC_NARY|ASSOC_LEFT), 123 ASSOC_NARY_RIGHT = (ASSOC_NARY|ASSOC_RIGHT), 124 ASSOC_UNARY_LEFT = (ASSOC_UNARY|ASSOC_LEFT), 125 ASSOC_UNARY_RIGHT = (ASSOC_UNARY|ASSOC_RIGHT), 126 ASSOC_BINARY_LEFT = (ASSOC_BINARY|ASSOC_LEFT), 127 ASSOC_BINARY_RIGHT = (ASSOC_BINARY|ASSOC_RIGHT), 128 ASSOC_NO = 0x0020 129 } 130 131 @nogc @safe nothrow pure 132 { 133 bool IS_RIGHT_ASSOC(AssocKind _x) { return cast(bool)(_x & ASSOC_RIGHT); } 134 bool IS_LEFT_ASSOC(AssocKind _x) { return cast(bool)(_x & ASSOC_LEFT); } 135 bool IS_NARY_ASSOC(AssocKind _x) { return cast(bool)(_x & ASSOC_NARY); } 136 bool IS_BINARY_ASSOC(AssocKind _x) { return cast(bool)(_x & ASSOC_BINARY); } 137 bool IS_UNARY_ASSOC(AssocKind _x) { return cast(bool)(_x & ASSOC_UNARY); } 138 bool IS_UNARY_BINARY_ASSOC(AssocKind _x) { return IS_UNARY_ASSOC(_x) || IS_BINARY_ASSOC(_x); } 139 bool IS_BINARY_NARY_ASSOC(AssocKind _x) { return IS_NARY_ASSOC(_x) || IS_BINARY_ASSOC(_x); } 140 141 /* not valid for NARY */ 142 bool IS_EXPECT_RIGHT_ASSOC(AssocKind _x) { return _x && _x != AssocKind.ASSOC_UNARY_LEFT; } 143 bool IS_EXPECT_LEFT_ASSOC(AssocKind _x) { return _x && _x != AssocKind.ASSOC_UNARY_RIGHT; } 144 } 145 146 struct Rule { 147 uint index; 148 Production *prod; 149 int op_priority; 150 AssocKind op_assoc; 151 int rule_priority; 152 AssocKind rule_assoc; 153 Elem*[] elems; 154 Elem *end; 155 Code speculative_code; 156 Code final_code; 157 Vec!(Code*) pass_code; 158 int action_index; 159 Rule *same_reduction; 160 161 void serialize(Serializer s) 162 { 163 s.map(index, "index"); 164 s.map(prod, "prod"); 165 s.map(op_priority, "op_priority"); 166 s.map(op_assoc, "op_assoc"); 167 s.map(rule_priority, "rule_priority"); 168 s.map(rule_assoc, "rule_assoc"); 169 s.map(elems, "elems"); 170 s.map(end, "end"); 171 s.map(speculative_code, "speculative_code"); 172 s.map(final_code, "final_code"); 173 s.map(pass_code, "pass_code"); 174 s.map(action_index, "action_index"); 175 s.map(same_reduction, "same_reduction"); 176 } 177 } 178 179 enum TermKind { 180 TERM_STRING, TERM_REGEX, TERM_CODE, TERM_TOKEN 181 } 182 struct Term { 183 TermKind kind; 184 uint index; 185 int term_priority; 186 string term_name; 187 AssocKind op_assoc; 188 int op_priority; 189 string string_; 190 mixin(bitfields!( 191 uint, "scan_kind", 3, 192 uint, "ignore_case", 1, 193 uint, "trailing_context", 1, 194 uint, "", 27 195 )); 196 Production *regex_production; 197 198 void serialize(Serializer s) 199 { 200 s.map(kind, "kind"); 201 s.map(index, "index"); 202 s.map(term_priority, "term_priority"); 203 //s.map(term_name, "term_name"); 204 s.map(op_assoc, "op_assoc"); 205 s.map(op_priority, "op_priority"); 206 //s.map(string_, "string_"); 207 208 mixin(fieldMap!("scan_kind", s)); 209 mixin(fieldMap!("ignore_case", s)); 210 mixin(fieldMap!("trailing_context", s)); 211 s.map(regex_production, "regex_production"); 212 } 213 214 } 215 216 enum DeclarationKind { 217 DECLARE_TOKENIZE, DECLARE_LONGEST_MATCH, DECLARE_ALL_MATCHES, 218 DECLARE_SET_OP_PRIORITY, DECLARE_STATES_FOR_ALL_NTERMS, 219 DECLARE_STATE_FOR, DECLARE_WHITESPACE, DECLARE_SAVE_PARSE_TREE, DECLARE_NUM 220 } 221 222 alias DECLARE_NUM = DeclarationKind.DECLARE_NUM; 223 224 struct Declaration { 225 Elem * elem; 226 DeclarationKind kind; 227 uint index; 228 229 void serialize(Serializer s) 230 { 231 s.map(elem, "elem"); 232 s.map(kind, "kind"); 233 s.map(index, "index"); 234 } 235 } 236 237 enum InternalKind { 238 INTERNAL_NOT, INTERNAL_HIDDEN, INTERNAL_CONDITIONAL, INTERNAL_STAR, INTERNAL_PLUS 239 } 240 241 struct Production { 242 string name; 243 Rule*[] rules; 244 uint index; 245 mixin(bitfields!( 246 uint, "regex", 1, 247 uint, "in_regex", 1, 248 uint, "internal", 3, /* production used for EBNF */ 249 uint, "live", 1, 250 uint, "", 26 251 )); 252 Rule *nullable; /* shortest rule for epsilon reduction */ 253 Production*[DECLARE_NUM] declaration_group; 254 Declaration*[DECLARE_NUM] last_declaration; 255 State *state; /* state for independent parsing of this productions*/ 256 Elem *elem; /* base elem for the item set of the above state */ 257 Term *regex_term; /* regex production terminal */ 258 Production *next; 259 260 void serialize(Serializer s) 261 { 262 //s.map(name, "name"); 263 s.map(rules, "rules"); 264 s.map(index, "index"); 265 mixin(fieldMap!("regex", s)); 266 mixin(fieldMap!("in_regex", s)); 267 mixin(fieldMap!("internal", s)); 268 mixin(fieldMap!("live", s)); 269 s.map(nullable, "nullable"); 270 s.map(declaration_group, "declaration_group"); 271 s.map(last_declaration, "last_declaration"); 272 s.map(state, "state"); 273 s.map(elem, "elem"); 274 s.map(regex_term, "regex_term"); 275 s.map(next, "next"); 276 } 277 } 278 279 280 281 enum ElemKind { 282 ELEM_NTERM, ELEM_TERM, ELEM_UNRESOLVED, ELEM_END 283 } 284 285 bool elemTermOrNtermEqual(in Elem a, in Elem b) 286 { 287 if (a.kind != b.kind) return false; 288 if (__ctfe) 289 { 290 if (a.kind == ElemKind.ELEM_NTERM) return a.e.nterm == b.e.nterm; 291 else if (a.kind == ElemKind.ELEM_TERM) return a.e.term == b.e.term; 292 else return a.e.unresolved == b.e.unresolved; 293 } 294 return a.e.term_or_nterm == b.e.term_or_nterm; 295 } 296 297 struct Elem { 298 ElemKind kind; 299 uint index; 300 Rule *rule; 301 union E { 302 Production *nterm; 303 Term *term; 304 void *term_or_nterm; 305 string unresolved; 306 } 307 E e; 308 309 @property inout(Term)* term() inout @trusted 310 { 311 assert(kind == ElemKind.ELEM_TERM); 312 return e.term; 313 } 314 315 @property void term(Term* t) @trusted 316 { 317 e.term = t; 318 kind = ElemKind.ELEM_TERM; 319 } 320 321 @property inout(Production)* nterm() inout @trusted 322 { 323 assert(kind == ElemKind.ELEM_NTERM); 324 return e.nterm; 325 } 326 327 @property void nterm(Production* p) 328 { 329 kind = ElemKind.ELEM_NTERM; 330 e.nterm = p; 331 } 332 333 string toString() 334 { 335 if (kind == ElemKind.ELEM_NTERM) return "NTERM: " ~ nterm.name; 336 if (kind == ElemKind.ELEM_TERM) return "TERM: " ~ term.term_name; 337 if (kind == ElemKind.ELEM_UNRESOLVED) return "UNRES: " ~ e.unresolved; 338 return "END"; 339 } 340 341 void serialize(Serializer s) 342 { 343 s.map(kind, "kind"); 344 s.map(index, "index"); 345 s.map(rule, "rule"); 346 347 if (kind == ElemKind.ELEM_NTERM) 348 s.map(e.nterm, "nterm"); 349 else if (kind == ElemKind.ELEM_TERM) 350 s.map(e.term, "term"); 351 /* else if (kind == ElemKind.ELEM_UNRESOLVED) */ 352 /* s.map(e.unresolved, "unresolved"); */ 353 } 354 } 355 356 struct Grammar { 357 Production*[] productions; 358 Term*[] terminals; 359 State*[] states; 360 Action*[] actions; 361 Code scanner; 362 /* Code *code; */ 363 /* int ncode; */ 364 Declaration*[] declarations; 365 Vec!(D_Pass *) passes; 366 string default_white_space; 367 /* grammar construction options */ 368 int set_op_priority_from_rule; 369 int right_recursive_BNF; 370 int states_for_whitespace; 371 int states_for_all_nterms; 372 int tokenizer; 373 int longest_match; 374 int save_parse_tree; 375 /* grammar writing options */ 376 int scanner_blocks; 377 int scanner_block_size; 378 /* temporary variables for grammar construction */ 379 Production * p; 380 Rule * r; 381 Elem * e; 382 int action_index; 383 int action_count; 384 int pass_index; 385 int rule_index; 386 387 void serialize(Serializer s) 388 { 389 /* s.mapCArray(code, "code", ncode); */ 390 s.map(scanner, "scanner"); 391 s.map(productions, "productions"); 392 s.map(terminals, "terminals"); 393 s.map(states, "states"); 394 s.map(actions, "actions"); 395 s.map(declarations, "declarations"); 396 s.map(passes, "passes"); 397 //s.mapCString(default_white_space, "default_white_space"); 398 399 400 s.map(set_op_priority_from_rule, "set_op_priority_from_rule"); 401 s.map(right_recursive_BNF, "right_recursive_BNF"); 402 s.map(states_for_whitespace, "states_for_whitespace"); 403 s.map(states_for_all_nterms, "states_for_all_nterms"); 404 s.map(tokenizer, "tokenizer"); 405 s.map(longest_match, "longest_match"); 406 s.map(save_parse_tree, "save_parse_tree"); 407 408 s.map(scanner_blocks, "scanner_blocks"); 409 s.map(scanner_block_size, "scanner_block_size"); 410 } 411 } 412 413 alias D_Grammar = Grammar; 414 415 /* 416 Copyright 2002-2004 John Plevyak, All Rights Reserved 417 */ 418 419 420 immutable string[] action_types = [ "ACCEPT", "SHIFT", "REDUCE" ]; 421 422 423 Production* new_production(Grammar *g, string name) @safe { 424 Production *p = lookup_production(g, name); 425 if (p) { 426 return p; 427 } 428 p = new Production(); 429 g.productions ~= p; 430 p.name = name; 431 return p; 432 } 433 434 Rule * 435 new_rule(Grammar *g, Production *p) @safe { 436 Rule *r = new Rule(); 437 r.prod = p; 438 r.end = new Elem(); 439 r.end.kind = ElemKind.ELEM_END; 440 r.end.rule = r; 441 r.action_index = g.action_index; 442 return r; 443 } 444 445 private Elem * 446 new_elem_term(Term *t, Rule *r) @safe { 447 Elem *e = new Elem(); 448 e.term = t; 449 e.rule = r; 450 r.elems ~= e; 451 return e; 452 } 453 454 Elem * 455 new_elem_nterm(Production *p, Rule *r) @safe { 456 Elem *e = new Elem(); 457 e.kind = ElemKind.ELEM_NTERM; 458 e.e.nterm = p; 459 e.rule = r; 460 return e; 461 } 462 463 private Elem * 464 new_term_string(Grammar *g, string s, Rule *r) @safe 465 { 466 Term *t = new Term(); 467 t.string_ = s; 468 g.terminals ~= t; 469 return new_elem_term(t, r); 470 } 471 472 string escape_string_for_regex(const char[] s) @safe 473 { 474 auto result = appender!string(); 475 result.reserve(s.length * 2); 476 foreach(c; s) 477 { 478 switch (c) 479 { 480 case '(': 481 case ')': 482 case '[': 483 case ']': 484 case '-': 485 case '^': 486 case '*': 487 case '?': 488 case '+': 489 result ~= '\\'; 490 goto default; 491 default: 492 result ~= c; 493 } 494 } 495 496 return result.data; 497 } 498 499 private string unescapeTermString(const(char)[] termString, TermKind kind) @safe 500 { 501 string result; 502 503 uint start = 0; 504 int length; 505 uint base = 0; 506 507 508 for (uint i = 0; i < termString.length; ++i) { 509 if (termString[i] == '\\') { 510 switch (termString[i + 1]) { 511 case '\\': 512 if (kind == TermKind.TERM_STRING) 513 { result ~= '\\'; i++; break; } 514 else 515 goto default; 516 case 'b': result ~= '\b'; i++; break; 517 case 'f': result ~= '\f'; i++; break; 518 case 'n': result ~= '\n'; i++; break; 519 case 'r': result ~= '\r'; i++; break; 520 case 't': result ~= '\t'; i++; break; 521 case 'v': result ~= '\v'; i++; break; 522 case 'a': result ~= '\a'; i++; break; 523 case 'c': assert(false); //return; 524 case '\"': 525 if (kind == TermKind.TERM_REGEX) 526 { result ~= '\"'; i++; break; } 527 else 528 goto default; 529 case '\'': 530 if (kind == TermKind.TERM_STRING) 531 { result ~= '\''; i++; break; } 532 else 533 goto default; 534 case 'x': 535 length = 0; 536 if (termString[i + 2].isHexDigit()) { 537 base = 16; 538 start = i + 2; 539 length++; 540 if (termString[i + 3].isHexDigit()) 541 length++; 542 } 543 i += length + 1; 544 goto Lncont; 545 case 'd': 546 length = 0; 547 if (termString[i + 2].isDigit()) { 548 base = 10; 549 start = i + 2; 550 length++; 551 if ((termString[i + 3]).isDigit()) { 552 length++; 553 if (termString[i + 4].isDigit() && ((termString[i + 2] < '2') || ((termString[i + 2] == '2') && ((termString[i + 3] < '5') || 554 ((termString[i + 3] == '5') && (termString[i + 4] < '6')))))) 555 length++; 556 } 557 } 558 i += length + 1; 559 goto Lncont; 560 case '0': case '1': case '2': case '3': 561 case '4': case '5': case '6': case '7': 562 length = 1; 563 base = 8; 564 start = i + 1; 565 if (termString[i + 2].isDigit() && (termString[i + 2] != '8') && (termString[i + 2] != '9')) { 566 length++; 567 if (termString[i + 3].isDigit() && (termString[i + 3] != '8') && (termString[i + 3] != '9')) { 568 length++; 569 } 570 } 571 i += length; 572 /* fall through */ 573 Lncont: 574 if (length > 0) { 575 result ~= cast(char)termString[start .. start + length].to!ubyte(base); 576 if (i < termString.length) 577 break; 578 d_fail("encountered an escaped null while processing '%s'", termString); 579 } else 580 goto next; 581 break; 582 default: 583 result ~= termString[i]; 584 result ~= termString[i + 1]; 585 i ++; 586 break; 587 } 588 } 589 else 590 { 591 result ~= termString[i]; 592 } 593 next:; 594 } 595 596 if (!result.length) 597 d_fail("empty string after unescape '%s'", termString); 598 599 return result; 600 } 601 602 private void 603 unescape_term_string(Term *t) { 604 t.string_ = unescapeTermString(t.string_, t.kind); 605 } 606 607 Elem * new_string(Grammar *g, string s, Rule *r) 608 { 609 Elem *x = new_term_string(g, s[1 .. $ - 1], r); 610 x.e.term.kind = (s[0] == '"') ? TermKind.TERM_REGEX : TermKind.TERM_STRING; 611 unescape_term_string(x.e.term); 612 return x; 613 } 614 615 Elem * 616 new_utf8_char(Grammar *g, const(char) *s, const(char) *e, Rule *r) { 617 char[4] utf8_code; 618 ulong utf32_code, base, len = 0; 619 for (utf32_code=0, base=1; e>=s+3; base*=16) { 620 e--; 621 if (*e >= '0' && *e <= '9') 622 utf32_code += base * (*e - '0'); 623 else if (*e >= 'a' && *e <= 'f') 624 utf32_code += base * (*e - 'a' + 10); 625 else if (*e >= 'A' && *e <= 'F') 626 utf32_code += base * (*e - 'A' + 10); 627 } 628 if (utf32_code < 0x80) { 629 utf8_code[0] = cast(char)utf32_code; 630 len = 1; 631 } else if (utf32_code < 0x800) { 632 utf8_code[0] = 0xc0 | ((utf32_code >> 6) & 0x1f); 633 utf8_code[1] = 0x80 | (utf32_code & 0x3f); 634 len = 2; 635 } else if (utf32_code < 0x10000) { 636 utf8_code[0] = 0xe0 | ((utf32_code >> 12) & 0x0f); 637 utf8_code[1] = 0x80 | ((utf32_code >> 6) & 0x3f); 638 utf8_code[2] = 0x80 | (utf32_code & 0x3f); 639 len = 3; 640 } else if (utf32_code < 0x02000000) { 641 utf8_code[0] = 0xf0 | ((utf32_code >> 18) & 0x07); 642 utf8_code[1] = 0x80 | ((utf32_code >> 12) & 0x3f); 643 utf8_code[2] = 0x80 | ((utf32_code >> 6) & 0x3f); 644 utf8_code[3] = 0x80 | (utf32_code & 0x3f); 645 len = 4; 646 } else { 647 d_fail("UTF32 Unicode value U+%8X too large for valid UTF-8 encoding (cf. Unicode Spec 4.0, section 3.9)", utf32_code); 648 } 649 Elem *x = new_term_string(g, utf8_code[0 .. len].idup, r); 650 x.e.term.kind = TermKind.TERM_STRING; 651 return x; 652 } 653 654 Elem * 655 new_ident(string s, Rule *r) @safe 656 { 657 Elem *x = new Elem(); 658 x.kind = ElemKind.ELEM_UNRESOLVED; 659 x.e.unresolved = s; 660 x.rule = r; 661 if (r) 662 r.elems ~= x; 663 return x; 664 } 665 666 void 667 new_token(Grammar *g, string s) @safe { 668 Term *t = new Term(); 669 t.string_ = s; 670 g.terminals ~= t; 671 t.kind = TermKind.TERM_TOKEN; 672 } 673 674 Elem * 675 new_code(Grammar *g, string s, Rule *r) { 676 Elem *x = new_term_string(g, s, r); 677 x.e.term.kind = TermKind.TERM_CODE; 678 return x; 679 } 680 681 Elem * 682 dup_elem(Elem *e, Rule *r) @safe { 683 Elem *ee = new Elem(); 684 *ee = *e; 685 ee.rule = r; 686 return ee; 687 } 688 689 void 690 new_declaration(Grammar *g, Elem *e, DeclarationKind kind) { 691 Declaration *d = new Declaration(); 692 d.elem = e; 693 d.kind = kind; 694 d.index = cast(uint)g.declarations.length; 695 g.declarations ~= d; 696 } 697 698 void 699 add_declaration(Grammar *g, string s, DeclarationKind kind, uint line) { 700 if (s.length == 0) { 701 switch (kind) { 702 case DeclarationKind.DECLARE_SET_OP_PRIORITY: g.set_op_priority_from_rule = 1; return; 703 case DeclarationKind.DECLARE_STATES_FOR_ALL_NTERMS: g.states_for_all_nterms = 1; return; 704 case DeclarationKind.DECLARE_LONGEST_MATCH: g.longest_match = 1; break; 705 case DeclarationKind.DECLARE_ALL_MATCHES: g.longest_match = 0; break; 706 case DeclarationKind.DECLARE_TOKENIZE: g.tokenizer = 1; break; 707 case DeclarationKind.DECLARE_SAVE_PARSE_TREE: g.save_parse_tree = 1; return; 708 default: d_fail("declare expects argument, line %d", line); 709 } 710 } 711 switch (kind) { 712 case DeclarationKind.DECLARE_WHITESPACE: g.default_white_space = s; return; 713 case DeclarationKind.DECLARE_SET_OP_PRIORITY: 714 d_fail("declare does not expect argument, line %d", line); 715 break; 716 default: 717 new_declaration(g, new_ident(s, null), kind); 718 break; 719 } 720 } 721 722 D_Pass * 723 find_pass(Grammar *g, const(char)[] name) { 724 name = name.stripLeft(); 725 foreach (p; g.passes) 726 if (p.name == name) 727 return p; 728 return null; 729 } 730 731 void 732 add_pass(Grammar *g, string name, uint kind, uint line) { 733 if (find_pass(g, name)) 734 d_fail("duplicate pass '%s' line %d", name, line); 735 else { 736 D_Pass *p = new D_Pass(); 737 p.name = name; 738 p.kind = kind; 739 p.index = g.pass_index++; 740 vec_add(&g.passes, p); 741 } 742 } 743 744 void 745 add_pass_code(Grammar *g, Rule *r, string name, 746 string code, uint pass_line, uint code_line) 747 { 748 D_Pass *p = find_pass(g, name); 749 if (!p) 750 d_fail("unknown pass '%s' line %d", name, pass_line); 751 while (r.pass_code.length <= p.index) vec_add(&r.pass_code, null); 752 r.pass_code[p.index] = new Code(); 753 r.pass_code[p.index].code = code; 754 r.pass_code[p.index].line = code_line; 755 } 756 757 758 Production * 759 new_internal_production(Grammar *g, Production *p) @safe { 760 string n = p ? p.name : " _synthetic"; 761 string name = n ~ "__" ~ g.productions.length.to!string(); 762 Production *pp = new_production(g, name); 763 pp.internal = InternalKind.INTERNAL_HIDDEN; 764 pp.regex = p ? p.regex : 0; 765 if (p) { 766 bool found = false; 767 Production *tp = null; 768 for (int i = 0; i < g.productions.length; i++) { 769 if (found) { 770 Production *ttp = g.productions[i]; 771 g.productions[i] = tp; 772 tp = ttp; 773 } else if (p == g.productions[i]) { 774 found = true; 775 tp = g.productions[i+1]; 776 g.productions[i+1] = pp; 777 i++; 778 } 779 } 780 } 781 return pp; 782 } 783 784 void 785 conditional_EBNF(Grammar *g) { 786 Production *pp = new_internal_production(g, g.p); 787 pp.internal = InternalKind.INTERNAL_CONDITIONAL; 788 Rule *rr = new_rule(g, pp); 789 rr.elems ~= g.r.elems[$-1]; 790 g.r.elems[$-1].rule = rr; 791 rr.elems[$-1].rule = rr; 792 pp.rules ~= rr; 793 pp.rules ~= new_rule(g, pp); 794 g.r.elems[$-1] = new_elem_nterm(pp, g.r); 795 } 796 797 void 798 star_EBNF(Grammar *g) { 799 Production *pp = new_internal_production(g, g.p); 800 pp.internal = InternalKind.INTERNAL_STAR; 801 Rule *rr = new_rule(g, pp); 802 if (!g.right_recursive_BNF) { 803 rr.elems ~= new_elem_nterm(pp, rr); 804 rr.elems ~= g.r.elems[$-1]; 805 g.r.elems[$-1] = new_elem_nterm(pp, g.r); 806 rr.elems[$-1].rule = rr; 807 } else { 808 rr.elems ~= g.r.elems[$-1]; 809 g.r.elems[$-1] = new_elem_nterm(pp, g.r); 810 rr.elems[$-1].rule = rr; 811 rr.elems ~= new_elem_nterm(pp, rr); 812 } 813 pp.rules ~= rr; 814 pp.rules ~= new_rule(g, pp); 815 } 816 817 void 818 plus_EBNF(Grammar *g) { 819 Production *pp = new_internal_production(g, g.p); 820 pp.internal = InternalKind.INTERNAL_PLUS; 821 Rule *rr = new_rule(g, pp); 822 Elem *elem = g.r.elems[$-1]; 823 if (!g.right_recursive_BNF) { 824 rr.elems ~= new_elem_nterm(pp, rr); 825 rr.elems ~= dup_elem(elem, rr); 826 g.r.elems[$-1] = new_elem_nterm(pp, g.r); 827 if (g.r.rule_priority) { 828 rr.rule_priority = g.r.rule_priority; 829 rr.rule_assoc = AssocKind.ASSOC_NARY_LEFT; 830 } 831 } else { 832 rr.elems ~= dup_elem(elem, rr); 833 g.r.elems[$-1] = new_elem_nterm(pp, g.r); 834 rr.elems ~= new_elem_nterm(pp, rr); 835 if (g.r.rule_priority) { 836 rr.rule_priority = g.r.rule_priority; 837 rr.rule_assoc = AssocKind.ASSOC_NARY_RIGHT; 838 } 839 } 840 pp.rules ~= rr; 841 rr = new_rule(g, pp); 842 rr.elems ~= elem; 843 elem.rule = rr; 844 pp.rules ~= rr; 845 } 846 847 void 848 rep_EBNF(Grammar *g, int min, int max) { 849 if (max < min) max = min; 850 851 Production *pp = new_internal_production(g, g.p); 852 Elem *elem = g.r.elems[$-1]; 853 for (int i = min; i <= max; i++) { 854 Rule *rr = new_rule(g, pp); 855 for (int j = 0; j < i; j++) 856 rr.elems ~= dup_elem(elem, rr); 857 pp.rules ~= rr; 858 } 859 g.r.elems[$-1] = new_elem_nterm(pp, g.r); 860 FREE(elem); 861 } 862 863 void 864 initialize_productions(Grammar *g) @safe { 865 Production *pp = new_production(g, "0 Start"); 866 pp.internal = InternalKind.INTERNAL_HIDDEN; 867 } 868 869 void 870 finish_productions(Grammar *g) { 871 Production *pp = g.productions[0]; 872 Rule *rr = new_rule(g, pp); 873 rr.elems ~= new_elem_nterm(null, rr); 874 pp.rules ~= rr; 875 rr.elems[0].e.nterm = g.productions[1]; 876 } 877 878 Production* lookup_production(Grammar* g, const(char)[] name) @safe 879 { 880 foreach(p; g.productions) 881 if (p.name == name) return p; 882 return null; 883 } 884 885 private Term * 886 lookup_token(Grammar *g, const(char)[] name) @safe { 887 foreach(t; g.terminals) 888 if (t.kind == TermKind.TERM_TOKEN && t.string_ == name) 889 return t; 890 return null; 891 } 892 893 private Term * 894 unique_term(Grammar *g, Term *t) @safe { 895 foreach (i; g.terminals) 896 if (t.kind == i.kind && 897 t.term_priority == i.term_priority && 898 t.term_name == i.term_name && 899 (!g.set_op_priority_from_rule || 900 (t.op_assoc == i.op_assoc && 901 t.op_priority == i.op_priority)) && 902 t.string_ == i.string_) 903 return i; 904 return t; 905 } 906 907 private void 908 compute_nullable(Grammar *g) { 909 /* ensure that the trivial case is the first cause */ 910 foreach(p; g.productions) { 911 foreach (r; p.rules) 912 if (!r.elems.length) { 913 p.nullable = r; 914 break; 915 } 916 } 917 918 bool changed = true; 919 /* transitive closure */ 920 while (changed) { 921 changed = false; 922 foreach (p; g.productions) { 923 if (!p.nullable) 924 foreach (r; p.rules) { 925 foreach (e; r.elems) { 926 if (e.kind != ElemKind.ELEM_NTERM || !e.e.nterm.nullable) 927 goto Lnot_nullable; 928 } 929 changed = true; 930 p.nullable = r; 931 break; 932 } 933 Lnot_nullable:; 934 } 935 } 936 } 937 938 /* 939 verify and cleanup the grammar datastructures 940 - resolve non-terminals 941 - set element indexes 942 */ 943 private void 944 resolve_grammar(Grammar *g) { 945 Production *p, pp; 946 Elem *e; 947 Term *last_term, t; 948 949 g.rule_index = 0; 950 for (int i = 0; i < g.productions.length; i++) { 951 p = g.productions[i]; 952 if (p != lookup_production(g, p.name)) 953 d_fail("duplicate production '%s'", p.name); 954 p.index = i; 955 foreach (r; p.rules) { 956 r.index = g.rule_index++; 957 last_term = null; 958 for (int k = 0; k < r.elems.length; k++) { 959 e = r.elems[k]; 960 e.index = k; 961 if (e.kind == ElemKind.ELEM_UNRESOLVED) { 962 pp = lookup_production(g, e.e.unresolved); 963 if (pp) { 964 e.e.unresolved = null; 965 e.kind = ElemKind.ELEM_NTERM; 966 e.e.nterm = pp; 967 } else 968 { 969 t = lookup_token(g, e.e.unresolved); 970 if (t) { 971 e.e.unresolved = null; 972 e.kind = ElemKind.ELEM_TERM; 973 e.e.term = t; 974 } else { 975 d_fail("unresolved identifier: '%s'", e.e.unresolved); 976 } 977 } 978 } 979 if (e.kind == ElemKind.ELEM_TERM) 980 last_term = e.e.term; 981 } 982 r.end.index = cast(uint)r.elems.length; 983 if (g.set_op_priority_from_rule) { 984 if (last_term && r.rule_assoc) { 985 last_term.op_assoc = r.rule_assoc; 986 last_term.op_priority = r.rule_priority; 987 } 988 } 989 } 990 } 991 for (int i = 0; i < g.terminals.length; i++) 992 g.terminals[i].index = i; 993 compute_nullable(g); 994 } 995 996 private void 997 merge_identical_terminals(Grammar *g) { 998 foreach (p; g.productions) { 999 foreach (r; p.rules) { 1000 foreach (e; r.elems) { 1001 if (e.kind == ElemKind.ELEM_TERM) 1002 e.e.term = unique_term(g, e.e.term); 1003 } 1004 } 1005 } 1006 } 1007 1008 void 1009 print_term(Term *t) { 1010 string s = t.string_ ? escape_string(t.string_) : null; 1011 if (t.term_name) 1012 logf("term_name(\"%s\") ", t.term_name); 1013 else if (t.kind == TermKind.TERM_STRING) { 1014 if (!t.string_.length) 1015 logf("<EOF> "); 1016 else 1017 logf("string(\"%s\") ", s); 1018 } else if (t.kind == TermKind.TERM_REGEX) { 1019 logf("regex(\"%s\") ", s); 1020 } else if (t.kind == TermKind.TERM_CODE) 1021 logf("code(\"%s\") ", s); 1022 else if (t.kind == TermKind.TERM_TOKEN) 1023 logf("token(\"%s\") ", s); 1024 else 1025 d_fail("unknown token kind"); 1026 } 1027 1028 void 1029 print_elem(Elem *ee) { 1030 if (ee.kind == ElemKind.ELEM_TERM) 1031 print_term(ee.e.term); 1032 else if (ee.kind == ElemKind.ELEM_UNRESOLVED) 1033 logf("%s ", ee.e.unresolved); 1034 else 1035 logf("%s ", ee.e.nterm.name); 1036 } 1037 1038 private string 1039 assoc_str(AssocKind e) { 1040 return [ 1041 AssocKind.ASSOC_NONE : "none", 1042 AssocKind.ASSOC_NARY_LEFT: "left" , 1043 AssocKind.ASSOC_NARY_RIGHT: "right" , 1044 AssocKind.ASSOC_UNARY_LEFT: "unary_left" , 1045 AssocKind.ASSOC_UNARY_RIGHT: "unary_right" , 1046 AssocKind.ASSOC_BINARY_LEFT: "binary_left" , 1047 AssocKind.ASSOC_BINARY_RIGHT: "binary_right" , 1048 AssocKind.ASSOC_NO: "noassoc"][e]; 1049 } 1050 1051 void 1052 print_rule(Rule *r) { 1053 int k; 1054 1055 logf("%s: ", r.prod.name); 1056 for (k = 0; k < r.elems.length; k++) 1057 print_elem(r.elems[k]); 1058 if (r.speculative_code.code) 1059 logf("SPECULATIVE_CODE\n%s\nEND CODE\n", r.speculative_code.code); 1060 if (r.final_code.code) 1061 logf("FINAL_CODE\n%s\nEND CODE\n", r.final_code.code); 1062 } 1063 1064 void 1065 print_grammar(Grammar *g) { 1066 uint i, j, k; 1067 Production *pp; 1068 Rule *rr; 1069 1070 if (!g.productions.length) 1071 return; 1072 logf("PRODUCTIONS\n\n"); 1073 for (i = 0; i < g.productions.length; i++) { 1074 pp = g.productions[i]; 1075 logf("%s (%d)\n", pp.name, i); 1076 for (j = 0; j < pp.rules.length; j++) { 1077 rr = pp.rules[j]; 1078 if (!j) 1079 logf("\t: "); 1080 else 1081 logf("\t| "); 1082 for (k = 0; k < rr.elems.length; k++) 1083 print_elem(rr.elems[k]); 1084 if (rr.op_priority) 1085 logf("op %d ", rr.op_priority); 1086 if (rr.op_assoc) 1087 logf("$%s ", assoc_str(rr.op_assoc)); 1088 if (rr.rule_priority) 1089 logf("rule %d ", rr.rule_priority); 1090 if (rr.rule_assoc) 1091 logf("$%s ", assoc_str(rr.rule_assoc)); 1092 if (rr.speculative_code.code) 1093 logf("%s ", rr.speculative_code.code); 1094 if (rr.final_code.code) 1095 logf("%s ", rr.final_code.code); 1096 logf("\n"); 1097 } 1098 logf("\t;\n"); 1099 logf("\n"); 1100 } 1101 logf("TERMINALS\n\n"); 1102 for (i = 0; i < g.terminals.length; i++) { 1103 logf("\t"); 1104 print_term(g.terminals[i]); 1105 logf("(%d)\n", i + g.productions.length); 1106 } 1107 logf("\n"); 1108 } 1109 1110 private void 1111 print_item(Item *i) { 1112 int j, end = 1; 1113 1114 logf("\t%s: ", i.rule.prod.name); 1115 for (j = 0; j < i.rule.elems.length; j++) { 1116 Elem *e = i.rule.elems[j]; 1117 if (i == e) { 1118 logf(". "); 1119 end = 0; 1120 } 1121 print_elem(e); 1122 } 1123 if (end) 1124 logf(". "); 1125 logf("\n"); 1126 } 1127 1128 private void 1129 print_conflict(string kind, int *conflict) { 1130 if (!*conflict) { 1131 logf(" CONFLICT (before precedence and associativity)\n"); 1132 *conflict = 1; 1133 } 1134 logf("\t%s conflict ", kind); 1135 logf("\n"); 1136 } 1137 1138 private void 1139 print_state(State *s) { 1140 int j, conflict = 0; 1141 1142 writefln("STATE %d (%d ITEMS)%s", s.index, s.items.length, 1143 s.accept ? " ACCEPT" : ""); 1144 for (j = 0; j < s.items.length; j++) 1145 print_item(s.items[j]); 1146 if (s.gotos.length) 1147 logf(" GOTO\n"); 1148 for (j = 0; j < s.gotos.length; j++) { 1149 logf("\t"); 1150 print_elem(s.gotos[j].elem); 1151 logf(" : %d\n", s.gotos[j].state.index); 1152 } 1153 logf(" ACTION\n"); 1154 for (j = 0; j < s.reduce_actions.length; j++) { 1155 Action *a = s.reduce_actions[j]; 1156 writefln("\t%s\t", action_types[a.kind]); 1157 print_rule(a.rule); 1158 logf("\n"); 1159 } 1160 for (j = 0; j < s.shift_actions.length; j++) { 1161 Action *a = s.shift_actions[j]; 1162 writefln("\t%s\t", action_types[a.kind]); 1163 if (a.kind == ActionKind.ACTION_SHIFT) { 1164 print_term(a.term); 1165 logf("%d", a.state.index); 1166 } 1167 logf("\n"); 1168 } 1169 if (s.reduce_actions.length > 1) 1170 print_conflict("reduce/reduce", &conflict); 1171 if (s.reduce_actions.length && s.shift_actions.length) 1172 print_conflict("shift/reduce", &conflict); 1173 logf("\n"); 1174 } 1175 1176 void 1177 print_states(Grammar *g) { 1178 int i; 1179 1180 for (i = 0; i < g.states.length; i++) 1181 print_state(g.states[i]); 1182 } 1183 1184 private bool 1185 state_for_declaration(Grammar *g, int iproduction) { 1186 foreach (d; g.declarations) 1187 if (d.kind == DeclarationKind.DECLARE_STATE_FOR && 1188 d.elem.e.nterm.index == iproduction) 1189 return true; 1190 return false; 1191 } 1192 1193 private void 1194 make_elems_for_productions(Grammar *g) { 1195 int i, j, k, l; 1196 Rule *rr; 1197 Production *ppp; 1198 1199 Production *pp = g.productions[0]; 1200 for (i = 0; i < g.productions.length; i++) 1201 if (!g.productions[i].internal) { 1202 if (g.states_for_all_nterms || 1203 state_for_declaration(g, i)) { 1204 /* try to find an existing elem */ 1205 for (j = 0; j < g.productions.length; j++) 1206 for (k = 0; k < g.productions[j].rules.length; k++) { 1207 rr = g.productions[j].rules[k]; 1208 for (l = 0; l < rr.elems.length; l++) 1209 if (rr.elems[l].e.nterm == g.productions[i]) { 1210 g.productions[i].elem = rr.elems[l]; 1211 break; 1212 } 1213 } 1214 if (j >= g.productions.length) { /* not found */ 1215 g.productions[i].elem = 1216 new_elem_nterm(g.productions[i], new_rule(g, pp)); 1217 g.productions[i].elem.rule.index = g.rule_index++; /* fake */ 1218 } 1219 } 1220 } 1221 if (!g.states_for_all_nterms && 1222 g.states_for_whitespace) 1223 { 1224 ppp = lookup_production(g, "whitespace"); 1225 if (ppp) 1226 { 1227 ppp.elem = new_elem_nterm(ppp, new_rule(g, pp)); 1228 ppp.elem.rule.index = g.rule_index++; /* fake */ 1229 } 1230 } 1231 } 1232 1233 private void 1234 convert_regex_production_one(Grammar *g, Production *p) { 1235 size_t l; 1236 Rule *rr; 1237 1238 if (p.regex_term) /* already done */ 1239 return; 1240 1241 bool circular = false; 1242 1243 if (p.in_regex) 1244 d_fail("circular regex production '%s'", p.name); 1245 p.in_regex = 1; 1246 foreach (r; p.rules) { 1247 if (r.final_code.code || (r.speculative_code.code && p.rules.length > 1)) 1248 d_fail("final and/or multi-rule code not permitted in regex productions '%s'", p.name); 1249 foreach (e; r.elems) { 1250 if (e.kind == ElemKind.ELEM_NTERM) { 1251 if (!e.e.nterm.regex) 1252 d_fail("regex production '%s' cannot invoke non-regex production '%s'", 1253 p.name, e.e.nterm.name); 1254 Production *pp = e.e.nterm; 1255 for (l = 0; l < pp.rules.length; l++) 1256 if (pp.rules[l].speculative_code.code || pp.rules[l].final_code.code) 1257 d_fail("code not permitted in rule %d of regex productions '%s'", l, p.name); 1258 if (p != pp) { 1259 convert_regex_production_one(g, pp); 1260 } else { 1261 circular = true; 1262 } 1263 } else { /* e.kind == ElemKind.ELEM_TERM */ 1264 if (e.e.term.kind == TermKind.TERM_CODE || e.e.term.kind == TermKind.TERM_TOKEN) 1265 d_fail("regex production '%s' cannot include scanners or tokens"); 1266 } 1267 } 1268 } 1269 string buffer; 1270 Term *t = new Term(); 1271 t.kind = TermKind.TERM_REGEX; 1272 t.index = cast(uint)g.terminals.length; 1273 t.regex_production = p; 1274 g.terminals ~= t; 1275 p.regex_term = t; 1276 p.regex_term.term_name = p.name; 1277 //Elem *e; 1278 if (circular) { /* attempt to match to regex operators */ 1279 if (p.rules.length != 2) 1280 Lfail: d_fail("unable to resolve circular regex production: '%s'", p.name); 1281 l = p.rules[0].elems.length + p.rules[1].elems.length; 1282 if (l == 2 || l == 3) { 1283 if (p.rules[0].elems.length != 2 && p.rules[1].elems.length != 2) 1284 goto Lfail; 1285 Rule *r = p.rules[0].elems.length == 2 ? p.rules[0] : p.rules[1]; 1286 rr = p.rules[0] == r ? p.rules[1] : p.rules[0]; 1287 if (r.elems[0].e.nterm != p && r.elems[1].e.nterm != p) 1288 goto Lfail; 1289 Elem *e = r.elems[0].e.nterm == p ? r.elems[1] : r.elems[1]; 1290 if (rr.elems.length && e.e.term_or_nterm != rr.elems[0].e.term_or_nterm) 1291 goto Lfail; 1292 t = e.kind == ElemKind.ELEM_TERM ? e.e.term : e.e.nterm.regex_term; 1293 buffer ~= '('; 1294 if (t.kind == TermKind.TERM_STRING) 1295 buffer ~= escape_string_for_regex(t.string_); 1296 else 1297 buffer ~= t.string_; 1298 buffer ~= ')'; 1299 if (l == 2) 1300 { buffer ~= '*'; } 1301 else 1302 { buffer ~= '+'; } 1303 p.regex_term.string_ = buffer; 1304 } else 1305 goto Lfail; 1306 } else { /* handle the base case, p = (r | r'), r = (e e') */ 1307 if (p.rules.length > 1) 1308 { buffer ~= '('; } 1309 for (int j = 0; j < p.rules.length; j++) { 1310 Rule *r = p.rules[j]; 1311 if (r.elems.length > 1) 1312 { buffer ~= '('; } 1313 foreach (e; r.elems) { 1314 t = e.kind == ElemKind.ELEM_TERM ? e.e.term : e.e.nterm.regex_term; 1315 if (t.kind == TermKind.TERM_STRING) 1316 buffer ~= escape_string_for_regex(t.string_); 1317 else 1318 buffer ~= t.string_; 1319 } 1320 if (r.elems.length > 1) 1321 { buffer ~= ')'; } 1322 if (j != p.rules.length - 1) 1323 { buffer ~= '|'; } 1324 } 1325 if (p.rules.length > 1) 1326 { buffer ~= ')'; } 1327 p.regex_term.string_ = buffer; 1328 } 1329 p.in_regex = 0; 1330 } 1331 1332 private void 1333 convert_regex_productions(Grammar *g) { 1334 foreach (p; g.productions) { 1335 if (!p.regex) 1336 continue; 1337 convert_regex_production_one(g, p); 1338 } 1339 foreach (p; g.productions) { 1340 foreach (r; p.rules) { 1341 foreach (e; r.elems) { 1342 if (e.kind == ElemKind.ELEM_NTERM && e.e.nterm.regex_term) { 1343 e.e.term = e.e.nterm.regex_term; 1344 e.kind = ElemKind.ELEM_TERM; 1345 } 1346 } 1347 } 1348 } 1349 } 1350 1351 private void 1352 check_default_actions(Grammar *g) { 1353 Production *pdefault; 1354 1355 pdefault = lookup_production(g, "_"); 1356 if (pdefault && pdefault.rules.length > 1) 1357 d_fail("number of rules in default action != 1"); 1358 } 1359 1360 struct EqState { 1361 State *eq; 1362 Rule *diff_rule; 1363 State *diff_state; 1364 } 1365 1366 void 1367 build_eq(Grammar *g) { 1368 int i, j, k, changed = 1; 1369 State *s, ss; 1370 EqState *e, ee; 1371 1372 EqState[] eq = new EqState[g.states.length]; 1373 while (changed) { 1374 changed = 0; 1375 for (i = 0; i < g.states.length; i++) { 1376 s = g.states[i]; 1377 e = &eq[s.index]; 1378 for (j = i + 1; j < g.states.length; j++) { 1379 ss = g.states[j]; 1380 ee = &eq[ss.index]; 1381 if (e.eq || ee.eq) 1382 continue; 1383 if (s.same_shifts != ss.same_shifts && ss.same_shifts != s) 1384 continue; 1385 /* check gotos */ 1386 if (s.gotos.length != ss.gotos.length) 1387 continue; 1388 for (k = 0; k < s.gotos.length; k++) { 1389 if (elem_symbol(g, s.gotos[k].elem) != elem_symbol(g, ss.gotos[k].elem)) 1390 goto Lcontinue; 1391 if (s.gotos[k].state != ss.gotos[k].state) { 1392 EqState *ge = &eq[s.gotos[k].state.index]; 1393 EqState *gee = &eq[ss.gotos[k].state.index]; 1394 if (ge.eq != ss.gotos[k].state && gee.eq != s.gotos[k].state) 1395 goto Lcontinue; 1396 if ((ee.diff_state && ee.diff_state != eq[ss.gotos[k].state.index].eq) || 1397 (e.diff_state && e.diff_state != eq[s.gotos[k].state.index].eq)) 1398 goto Lcontinue; 1399 /* allow one different state */ 1400 ee.diff_state = ss.gotos[k].state; 1401 e.diff_state = s.gotos[k].state; 1402 } 1403 } 1404 /* check reductions */ 1405 if (s.reduce_actions.length != ss.reduce_actions.length) 1406 continue; 1407 for (k = 0; k < s.reduce_actions.length; k++) { 1408 if (s.reduce_actions[k].rule == ss.reduce_actions[k].rule) 1409 continue; 1410 if (s.reduce_actions[k].rule.prod != 1411 ss.reduce_actions[k].rule.prod) 1412 goto Lcontinue; 1413 if (s.reduce_actions[k].rule.elems.length != 1414 ss.reduce_actions[k].rule.elems.length) { 1415 if ((ee.diff_rule && ee.diff_rule != ss.reduce_actions[k].rule) || 1416 (e.diff_rule && e.diff_rule != s.reduce_actions[k].rule)) 1417 goto Lcontinue; 1418 /* allow one different rule */ 1419 ee.diff_rule = ss.reduce_actions[k].rule; 1420 e.diff_rule = s.reduce_actions[k].rule; 1421 } 1422 } 1423 ee.eq = s; 1424 changed = 1; 1425 Lcontinue:; 1426 } 1427 } 1428 } 1429 for (i = 0; i < g.states.length; i++) { 1430 s = g.states[i]; 1431 e = &eq[s.index]; 1432 if (e.eq) { 1433 if (!__ctfe && d_verbose_level > 2) { 1434 logf("eq %d %d ", s.index, e.eq.index); 1435 if (e.diff_state) 1436 logf("diff state (%d %d) ", 1437 e.diff_state.index, 1438 eq[e.eq.index].diff_state.index); 1439 if (e.diff_rule) { 1440 logf("diff rule "); 1441 logf("[ "); 1442 print_rule(e.diff_rule); 1443 logf("][ "); 1444 print_rule(eq[e.eq.index].diff_rule); 1445 logf("]"); 1446 } 1447 logf("\n"); 1448 } 1449 } 1450 } 1451 for (i = 0; i < g.states.length; i++) { 1452 s = g.states[i]; 1453 e = &eq[s.index]; 1454 if (e.eq && e.diff_state) { 1455 if (eq[e.diff_state.index].diff_rule && 1456 eq[e.diff_state.index].diff_rule.elems.length == 2) 1457 { 1458 s.reduces_to = e.eq; 1459 s.reduces_with = eq[e.eq.index].diff_rule; 1460 s.reduces_to_then_with = e.diff_rule; 1461 } else if (eq[eq[e.eq.index].diff_state.index].diff_rule && 1462 eq[eq[e.eq.index].diff_state.index].diff_rule.elems.length == 2) 1463 { 1464 e.eq.reduces_to = s; 1465 s.reduces_with = e.diff_rule; 1466 s.reduces_to_then_with = eq[e.eq.index].diff_rule; 1467 } 1468 } 1469 } 1470 for (i = 0; i < g.states.length; i++) { 1471 s = g.states[i]; 1472 if (s.reduces_to) 1473 if (d_verbose_level) 1474 logf("reduces_to %d %d\n", s.index, s.reduces_to.index); 1475 } 1476 } 1477 1478 Grammar * 1479 new_D_Grammar() { 1480 return new Grammar(); 1481 } 1482 1483 private void 1484 free_rule(Rule *r) { 1485 int i; 1486 FREE(r.end); 1487 vec_free(&r.pass_code); 1488 FREE(r); 1489 } 1490 1491 void 1492 free_D_Grammar(Grammar *g) { 1493 int i, j, k; 1494 1495 for (i = 0; i < g.productions.length; i++) { 1496 Production *p = g.productions[i]; 1497 for (j = 0; j < p.rules.length; j++) { 1498 Rule *r = p.rules[j]; 1499 if (r == g.r) 1500 g.r = null; 1501 for (k = 0; k < r.elems.length; k++) { 1502 Elem *e = r.elems[k]; 1503 if (e == p.elem) 1504 p.elem = null; 1505 FREE(e); 1506 } 1507 if (r.end == p.elem) 1508 p.elem = null; 1509 free_rule(r); 1510 } 1511 if (p.elem) { 1512 free_rule(p.elem.rule); 1513 FREE(p.elem); 1514 } 1515 FREE(p); 1516 } 1517 for (i = 0; i < g.actions.length; i++) 1518 free_Action(g.actions[i]); 1519 for (i = 0; i < g.states.length; i++) { 1520 State *s = g.states[i]; 1521 for (j = 0; j < s.gotos.length; j++) { 1522 FREE(s.gotos[j].elem); 1523 FREE(s.gotos[j]); 1524 } 1525 for (j = 0; j < s.right_epsilon_hints.length; j++) 1526 FREE(s.right_epsilon_hints[j]); 1527 vec_free(&s.right_epsilon_hints); 1528 for (j = 0; j < s.error_recovery_hints.length; j++) 1529 FREE(s.error_recovery_hints[j]); 1530 vec_free(&s.error_recovery_hints); 1531 if (!s.same_shifts) { 1532 for (j = 0; j < s.scanner.states.length; j++) { 1533 vec_free(&s.scanner.states[j].accepts); 1534 vec_free(&s.scanner.states[j].live); 1535 FREE(s.scanner.states[j]); 1536 } 1537 for (j = 0; j < s.scanner.transitions.length; j++) 1538 if (s.scanner.transitions[j]) { 1539 vec_free(&s.scanner.transitions[j].live_diff); 1540 vec_free(&s.scanner.transitions[j].accepts_diff); 1541 FREE(s.scanner.transitions[j]); 1542 } 1543 vec_free(&s.scanner.transitions); 1544 } 1545 FREE(s); 1546 } 1547 for (i = 0; i < g.declarations.length; i++) { 1548 FREE(g.declarations[i].elem); 1549 FREE(g.declarations[i]); 1550 } 1551 for (i = 0; i < g.passes.length; i++) { 1552 FREE(g.passes[i]); 1553 } 1554 vec_free(&g.passes); 1555 FREE(g); 1556 } 1557 1558 private int 1559 scanner_declaration(Declaration *d) { 1560 switch (d.kind) { 1561 case DeclarationKind.DECLARE_TOKENIZE: 1562 case DeclarationKind.DECLARE_LONGEST_MATCH: 1563 case DeclarationKind.DECLARE_ALL_MATCHES: 1564 return 1; 1565 default: 1566 return 0; 1567 } 1568 } 1569 1570 private void 1571 set_declaration_group(Production *p, Production *root, Declaration *d) { 1572 if (p.declaration_group[d.kind] == root) 1573 return; 1574 if (d.kind == DeclarationKind.DECLARE_TOKENIZE && p.declaration_group[d.kind]) { 1575 d_fail("shared tokenize subtrees"); 1576 return; 1577 } 1578 p.declaration_group[d.kind] = root; 1579 p.last_declaration[d.kind] = d; 1580 foreach (r; p.rules) { 1581 foreach (e; r.elems) 1582 if (e.kind == ElemKind.ELEM_NTERM) 1583 set_declaration_group(e.e.nterm, root, d); 1584 } 1585 } 1586 1587 private void 1588 propogate_declarations(Grammar *g) { 1589 Production *start = g.productions[0]; 1590 1591 /* global defaults */ 1592 if (g.tokenizer) 1593 new_declaration(g, new_elem_nterm(g.productions[0], null), DeclarationKind.DECLARE_TOKENIZE); 1594 if (g.longest_match) 1595 new_declaration(g, new_elem_nterm(g.productions[0], null), DeclarationKind.DECLARE_LONGEST_MATCH); 1596 /* resolve declarations */ 1597 foreach (d; g.declarations) { 1598 Elem *e = d.elem; 1599 if (e.kind == ElemKind.ELEM_UNRESOLVED) { 1600 Production *p; 1601 if (e.e.unresolved.length == 0) 1602 p = g.productions[0]; 1603 else 1604 { 1605 p = lookup_production(g, e.e.unresolved); 1606 if (!p) 1607 d_fail("unresolved declaration '%s'", e.e.unresolved); 1608 } 1609 e.e.unresolved = null; 1610 e.kind = ElemKind.ELEM_NTERM; 1611 e.e.nterm = p; 1612 } 1613 } 1614 /* build declaration groups (covering a production subtrees) */ 1615 foreach (d; g.declarations) { 1616 if (scanner_declaration(d)) { 1617 Production *p = d.elem.e.nterm; 1618 if (p == start) { 1619 foreach (pp; g.productions) { 1620 pp.declaration_group[d.kind] = start; 1621 pp.last_declaration[d.kind] = d; 1622 } 1623 } else 1624 set_declaration_group(p, p, d); 1625 } 1626 } 1627 /* set terminal scan_kind */ 1628 foreach (p; g.productions) { 1629 foreach (r; p.rules) { 1630 foreach (e; r.elems) { 1631 if (e.kind == ElemKind.ELEM_TERM) { 1632 if (!p.declaration_group[DeclarationKind.DECLARE_LONGEST_MATCH] && 1633 !p.declaration_group[DeclarationKind.DECLARE_ALL_MATCHES]) 1634 e.e.term.scan_kind = D_SCAN_DEFAULT; 1635 else if (p.declaration_group[DeclarationKind.DECLARE_LONGEST_MATCH] && 1636 !p.declaration_group[DeclarationKind.DECLARE_ALL_MATCHES]) 1637 e.e.term.scan_kind = D_SCAN_LONGEST; 1638 else if (!p.declaration_group[DeclarationKind.DECLARE_LONGEST_MATCH] && 1639 p.declaration_group[DeclarationKind.DECLARE_ALL_MATCHES]) 1640 e.e.term.scan_kind = D_SCAN_ALL; 1641 else { 1642 if (p.last_declaration[DeclarationKind.DECLARE_LONGEST_MATCH].index > 1643 p.last_declaration[DeclarationKind.DECLARE_ALL_MATCHES].index) 1644 e.e.term.scan_kind = D_SCAN_LONGEST; 1645 else 1646 e.e.term.scan_kind = D_SCAN_ALL; 1647 } 1648 } 1649 } 1650 } 1651 } 1652 } 1653 1654 private void 1655 merge_shift_actions(State *to, State *from) { 1656 foreach (f; from.shift_actions) { 1657 foreach (t; to.shift_actions) 1658 if (f.term == t.term) 1659 goto Lnext; 1660 to.shift_actions ~= f; 1661 Lnext:; 1662 } 1663 } 1664 1665 private void 1666 compute_declaration_states(Grammar *g, Production *p, Declaration *d) { 1667 State *base_s = null; 1668 int scanner = scanner_declaration(d); 1669 1670 foreach (s; g.states) { 1671 if (d.kind == DeclarationKind.DECLARE_TOKENIZE) { 1672 if (!base_s) 1673 base_s = s; 1674 else { 1675 s.same_shifts = base_s; 1676 merge_shift_actions(base_s, s); 1677 } 1678 } 1679 if (scanner) { 1680 foreach (k; s.items) 1681 if (k.kind == ElemKind.ELEM_TERM) 1682 switch (k.e.term.scan_kind) { 1683 case D_SCAN_LONGEST: 1684 if (s.scan_kind == D_SCAN_RESERVED || 1685 s.scan_kind == D_SCAN_LONGEST) 1686 s.scan_kind = D_SCAN_LONGEST; 1687 else 1688 s.scan_kind = D_SCAN_MIXED; 1689 break; 1690 case D_SCAN_ALL: 1691 if (s.scan_kind == D_SCAN_RESERVED || 1692 s.scan_kind == D_SCAN_ALL) 1693 s.scan_kind = D_SCAN_ALL; 1694 else 1695 s.scan_kind = D_SCAN_MIXED; 1696 break; 1697 default: 1698 break; 1699 } 1700 } 1701 } 1702 } 1703 1704 private void 1705 map_declarations_to_states(Grammar *g) { 1706 foreach (s; g.states) { 1707 s.scan_kind = D_SCAN_RESERVED; 1708 } 1709 /* map groups to sets of states */ 1710 foreach (d; g.declarations) 1711 if (scanner_declaration(d)) 1712 compute_declaration_states(g, d.elem.e.nterm, d); 1713 foreach (s; g.states) { 1714 if (s.scan_kind == D_SCAN_RESERVED) 1715 s.scan_kind = D_SCAN_DEFAULT; /* set the default */ 1716 } 1717 } 1718 1719 int 1720 build_grammar(Grammar *g) { 1721 resolve_grammar(g); 1722 convert_regex_productions(g); 1723 propogate_declarations(g); 1724 merge_identical_terminals(g); 1725 make_elems_for_productions(g); 1726 check_default_actions(g); 1727 build_LR_tables(g); 1728 map_declarations_to_states(g); 1729 if (!__ctfe && d_verbose_level) { 1730 logf("%d productions %d terminals %d states %d declarations\n", 1731 g.productions.length, g.terminals.length, g.states.length, 1732 g.declarations.length); 1733 } 1734 if (!__ctfe && d_verbose_level > 1) { 1735 print_grammar(g); 1736 print_states(g); 1737 } 1738 build_scanners(g); 1739 build_eq(g); 1740 return 0; 1741 } 1742 1743 1744 /* Wlodek Bzyl, <matwb@univ.gda.pl> 1745 */ 1746 1747 private void 1748 print_term_escaped(Term *t, int double_escaped) { 1749 string s; 1750 if (t.term_name) { 1751 logf("%s ", t.term_name); 1752 } else if (t.kind == TermKind.TERM_STRING) { 1753 s = t.string_ ? escape_string_single_quote(t.string_) : null; 1754 if (!t.string_) 1755 logf("<EOF> "); 1756 else { 1757 logf("'%s' ", double_escaped ? escape_string_single_quote(s) : s); 1758 if (t.ignore_case) 1759 logf("/i "); 1760 if (t.term_priority) 1761 writefln("%sterm %d ", double_escaped?"#":"$", t.term_priority); 1762 } 1763 } else if (t.kind == TermKind.TERM_REGEX) { 1764 s = t.string_ ? escape_string(t.string_) : null; 1765 //char *s = t.string_; // ? escape_string(t.string_) : null; 1766 string quote = double_escaped ? "\\\"" : "\""; 1767 logf("%s%s%s ", quote, double_escaped ? escape_string(s) : s, quote); 1768 if (t.ignore_case) 1769 logf("/i "); 1770 if (t.term_priority) 1771 writefln("%sterm %d ", double_escaped?"#":"$", t.term_priority); 1772 } else if (t.kind == TermKind.TERM_CODE) { 1773 s = t.string_ ? escape_string(t.string_) : null; 1774 logf("code(\"%s\") ", s); 1775 } else if (t.kind == TermKind.TERM_TOKEN) { 1776 s = t.string_ ? escape_string(t.string_) : null; 1777 logf("%s ", s); 1778 } else 1779 d_fail("unknown token kind"); 1780 } 1781 1782 /* print_elem changed to call print_term_escaped */ 1783 private void 1784 print_element_escaped(Elem *ee, int double_escaped) { 1785 if (ee.kind == ElemKind.ELEM_TERM) 1786 print_term_escaped(ee.e.term, double_escaped); 1787 else if (ee.kind == ElemKind.ELEM_UNRESOLVED) 1788 logf("%s ", ee.e.unresolved); 1789 else 1790 logf("%s ", ee.e.nterm.name); 1791 } 1792 1793 private void 1794 print_production(Production *p) { 1795 uint j, k; 1796 Rule *r; 1797 string[] opening = [ "" , "\n\t [ logf(\"" , "\n\t { logf(\"" ]; 1798 string[] closing = [ "\n" , "\\n\"); ]\n" , "\\n\"); }\n" ]; 1799 string[] middle = [ "\n\t: ", " <- " , " <= " ]; 1800 string[] assoc = [ "$" , "#" , "#" ]; 1801 string speculative_final_closing = "\\n\"); ]"; /* closing[1] without final newline */ 1802 string next_or_rule = "\t| "; 1803 // char *regex_production = " ::= "; 1804 1805 uint variant = 0; 1806 1807 for (j = 0; j < p.rules.length; j++) { 1808 Lmore: 1809 r = p.rules[j]; 1810 if (!j) { 1811 // if (p.regex) { 1812 // logf("%s%s%s", opening[variant], p.name, regex_production); 1813 // } else { 1814 writefln("%s%s%s", opening[variant], p.name, middle[variant]); 1815 // } 1816 } else { 1817 if (variant==0) 1818 writefln("%s", next_or_rule); 1819 else 1820 writefln("%s%s%s", opening[variant], p.name, middle[variant]); 1821 } 1822 1823 for (k = 0; k < r.elems.length; k++) 1824 print_element_escaped(r.elems[k], variant); 1825 1826 if (r.op_assoc) 1827 writefln(" %s%s ", assoc[variant], assoc_str(r.op_assoc)); 1828 if (r.op_priority) 1829 writefln("%d ", r.op_priority); 1830 if (r.rule_assoc) 1831 writefln(" %s%s ", assoc[variant], assoc_str(r.rule_assoc)); 1832 if (r.rule_priority) 1833 writefln("%d ", r.rule_priority); 1834 1835 if ((d_rdebug_grammar_level == 1 && variant == 0) || 1836 (d_rdebug_grammar_level == 3 && variant == 0)) { 1837 variant=1; 1838 goto Lmore; 1839 } 1840 if ((d_rdebug_grammar_level == 2 && variant == 0) || 1841 (d_rdebug_grammar_level == 3 && variant == 1)) { 1842 if (variant==1) 1843 writefln("%s", speculative_final_closing); 1844 variant=2; 1845 goto Lmore; 1846 } 1847 1848 writefln("%s", closing[variant]); 1849 variant = 0; 1850 } 1851 logf("\t;\n"); 1852 logf("\n"); 1853 } 1854 1855 private void 1856 print_productions(Grammar *g) { 1857 uint i; 1858 if (!g.productions.length) { 1859 logf("/*\n There were no productions in the grammar\n*/\n"); 1860 return; 1861 } 1862 for (i = 1; i < g.productions.length; i++) 1863 print_production(g.productions[i]); 1864 } 1865 1866 private void print_declare(string s, string n) { 1867 while(n.length && (n[0].isWhite() || n[0].isDigit())) n = n[1 .. $]; 1868 logf(s, n); 1869 } 1870 1871 private void 1872 print_declarations(Grammar *g) { 1873 int i; 1874 1875 if (g.tokenizer) 1876 logf("${declare tokenize}\n"); 1877 for (i = 0; i < g.declarations.length; i++) { 1878 Declaration *dd = g.declarations[i]; 1879 Elem *ee = dd.elem; 1880 switch (dd.kind) { 1881 case DeclarationKind.DECLARE_LONGEST_MATCH: 1882 if (g.longest_match) 1883 logf("${declare longest_match}\n"); 1884 else 1885 print_declare("${declare longest_match %s}\n", ee.e.nterm.name); 1886 break; 1887 case DeclarationKind.DECLARE_ALL_MATCHES: 1888 if (!g.longest_match) 1889 logf("${declare all_matches}\n"); 1890 else 1891 print_declare("${declare all_matches %s}\n", ee.e.nterm.name); 1892 break; 1893 default: 1894 logf("\n/*\nDeclaration.kind: %d", dd.kind); 1895 logf("\nElem.kind: %d\n*/\n", ee.kind); 1896 } 1897 } 1898 if (g.set_op_priority_from_rule) 1899 logf("${declare set_op_priority_from_rule}\n"); 1900 if (g.states_for_all_nterms) 1901 logf("${declare all_subparsers}\n"); 1902 /* todo: DeclarationKind.DECLARE_STATE_FOR */ 1903 if (g.default_white_space) 1904 logf("${declare whitespace %s}\n", g.default_white_space); 1905 if (g.save_parse_tree) 1906 logf("${declare save_parse_tree}\n"); 1907 /* todo: DECLARE_NUM */ 1908 1909 if (g.scanner.code) 1910 logf("${scanner %s}\n", g.scanner.code); 1911 1912 { int token_exists = 0; 1913 for (i = 0; i < g.terminals.length; i++) { 1914 Term *t = g.terminals[i]; 1915 if (t.kind == TermKind.TERM_TOKEN) { 1916 writefln("%s %s", token_exists?"":"${token", t.string_); 1917 token_exists = 1; 1918 } 1919 } 1920 if (token_exists) 1921 logf("}\n"); 1922 } 1923 1924 logf("\n"); 1925 } 1926 1927 void 1928 print_rdebug_grammar(Grammar *g) { 1929 logf("/*\n Generated by Make DParser\n"); 1930 logf(" Available at http://dparser.sf.net\n*/\n\n"); 1931 1932 print_declarations(g); 1933 print_productions(g); 1934 } 1935