1 module ddparser.lex; 2 3 import ddparser.gram; 4 import ddparser.util; 5 import ddparser.lr; 6 7 import std.stdio; 8 9 import core.stdc.stdlib; 10 import core.stdc.ctype; 11 12 enum LIVE_DIFF_IN_TRANSITIONS = false; 13 14 struct ScanStateTransition { 15 uint index; 16 VecAction live_diff; 17 VecAction accepts_diff; 18 } 19 20 struct ScanState { 21 uint index; 22 ScanState*[256] chars; 23 VecAction accepts; 24 VecAction live; 25 PointerSet!Action liveSet; 26 ScanStateTransition*[256] transition; 27 } 28 29 30 struct NFAState { 31 uint index; 32 Vec!(NFAState*)[256] chars; 33 Vec!(NFAState*) epsilon; 34 Vec!(Action*) accepts; 35 Vec!(Action*) live; 36 } 37 38 struct DFAState { 39 Vec!(NFAState*) states; 40 DFAState*[256] chars; 41 ScanState *scan; 42 } 43 44 alias VecDFAState = Vec!(DFAState *); 45 alias VecNFAState = Vec!(NFAState *); 46 47 struct LexState { 48 uint nfa_index; 49 VecNFAState allnfas; 50 uint transitions; 51 uint scanners; 52 uint ignore_case; 53 } 54 55 private NFAState * 56 new_NFAState(LexState *ls) @safe { 57 NFAState *n = new NFAState(); 58 n.index = ls.nfa_index++; 59 vec_add(&ls.allnfas, n); 60 return n; 61 } 62 63 private void 64 free_DFAState(DFAState *y) @safe { 65 vec_free(&y.states); 66 FREE(y); 67 } 68 69 private void 70 free_VecDFAState(VecDFAState *dfas) @safe { 71 int i; 72 for (i = 0; i < dfas.length; i++) 73 free_DFAState((*dfas)[i]); 74 vec_free(dfas); 75 } 76 77 private void 78 free_NFAState(NFAState *y) @safe { 79 int i; 80 for (i = 0; i < 256; i++) 81 vec_free(&y.chars[i]); 82 vec_free(&y.epsilon); 83 vec_free(&y.accepts); 84 FREE(y); 85 } 86 87 private void 88 free_VecNFAState(VecNFAState *nfas) { 89 int i; 90 for (i = 0; i < nfas.n; i++) 91 free_NFAState(nfas.v[i]); 92 vec_free(nfas); 93 } 94 95 extern(C) private int 96 nfacmp(const void *ai, const void *aj) { 97 uint32 i = (*cast(NFAState**)ai).index; 98 uint32 j = (*cast(NFAState**)aj).index; 99 return (i > j) ? 1 : ((i < j) ? -1 : 0); 100 } 101 102 private void 103 nfa_closure(DFAState *x) { 104 int j, k; 105 106 for (int i = 0; i < x.states.length; ++i) // x.states may change 107 foreach (j; x.states[i].epsilon) { 108 foreach (k; x.states) 109 if (j == k) 110 goto Lbreak; 111 vec_add(&x.states, j); 112 Lbreak:; 113 } 114 qsort(x.states.v, x.states.n, (x.states.v[0]).sizeof, &nfacmp); 115 } 116 117 private bool 118 eq_dfa_state(DFAState *x, DFAState *y) @safe { 119 if (x.states.n != y.states.n) 120 return false; 121 for (int i = 0; i < x.states.n; i++) 122 if (x.states[i] != y.states[i]) 123 return false; 124 return true; 125 } 126 127 private void 128 dfa_to_scanner(ref VecDFAState alldfas, ref ScanState*[] scanner) { 129 assert(scanner.length == 0); 130 for (int i = 0; i < alldfas.n; i++) { 131 alldfas[i].scan = new ScanState(); 132 alldfas[i].scan.index = i; 133 scanner ~= alldfas[i].scan; 134 } 135 foreach (i; alldfas) { 136 for (int j = 0; j < 256; j++) 137 if (i.chars[j]) 138 i.scan.chars[j] = i.chars[j].scan; 139 int highest = int.min; 140 foreach (j; i.states) 141 foreach (k; j.accepts) { 142 int p = k.term.term_priority; 143 if (highest < p) 144 highest = p; 145 } 146 foreach (j; i.states) 147 foreach (k; j.accepts) { 148 if (k.term.term_priority == highest) 149 vec_add(&i.scan.accepts, k); 150 } 151 } 152 } 153 154 private void 155 nfa_to_scanner(NFAState *n, Scanner *s) { 156 DFAState *x = new DFAState(); 157 VecDFAState alldfas; 158 159 vec_add(&x.states, n); 160 nfa_closure(x); 161 vec_add(&alldfas, x); 162 for (int i = 0; i < alldfas.length; i++) { // alldfas may change while iterating 163 for (int i_char = 0; i_char < 256; i_char++) { 164 PointerSet!NFAState states; 165 166 foreach (i_state; alldfas[i].states) { 167 foreach (i; i_state.chars[i_char]) { 168 states.add(i); 169 } 170 } 171 172 if (!states.isEmpty) { 173 DFAState *y = new DFAState(); 174 states.toVec(y.states); 175 nfa_closure(y); 176 foreach (i; alldfas) 177 if (eq_dfa_state(y, i)) { 178 free_DFAState(y); 179 y = i; 180 goto Lnext; 181 } 182 vec_add(&alldfas, y); 183 Lnext: 184 alldfas[i].chars[i_char] = y; 185 } 186 } 187 } 188 dfa_to_scanner(alldfas, s.states); 189 free_VecDFAState(&alldfas); 190 } 191 192 T popFront(T)(ref T[] v) @safe 193 { 194 if (!v.length) return 0; 195 scope(exit) v = v[1 .. $]; 196 return v[0]; 197 } 198 199 /* build a NFA for the regular expression */ 200 private int 201 build_regex_nfa(LexState *ls, ref const(uint8)[] areg, NFAState *pp, NFAState *nn, Action *trailing) @safe { 202 uint8 c; 203 const(ubyte)[] reg = areg; 204 NFAState *p = pp, s, x, n = nn; 205 int i, has_trailing = 0; 206 207 s = p; 208 while ((c = reg.popFront()) != 0) { 209 switch(c) { 210 case '(': 211 x = new_NFAState(ls); 212 has_trailing = build_regex_nfa(ls, reg, s, x, trailing) || 213 has_trailing; 214 p = s; 215 s = x; 216 break; 217 case ')': 218 goto Lreturn; 219 case '|': 220 vec_add(&s.epsilon, nn); 221 vec_add(&pp.epsilon, (s = new_NFAState(ls))); 222 break; 223 case '[': 224 { 225 bool[256] mark; 226 bool reversed = false; 227 if (reg[0] == '^') { 228 reg.popFront(); 229 reversed = true; 230 } 231 232 ubyte pc = ubyte.max; 233 while ((c = reg.popFront()) != 0) { 234 switch(c) { 235 case ']': 236 goto Lsetdone; 237 case '-': 238 c = reg.popFront(); 239 if (!c) 240 goto Lerror; 241 if (c == '\\') 242 c = reg.popFront(); 243 if (!c) 244 goto Lerror; 245 for (;pc <= c; pc++) 246 mark[pc] = true; 247 break; 248 case '\\': 249 c = reg.popFront(); 250 goto default; 251 default: 252 pc = c; 253 mark[c] = true; 254 break; 255 } 256 } 257 Lsetdone: 258 x = new_NFAState(ls); 259 for (i = 1; i < 256; i++) 260 if ((!reversed && mark[i]) || (reversed && !mark[i])) 261 vec_add(&s.chars[i], x); 262 p = s; 263 s = x; 264 } 265 break; 266 case '?': 267 vec_add(&p.epsilon, s); 268 break; 269 case '*': 270 vec_add(&p.epsilon, s); 271 vec_add(&s.epsilon, p); 272 break; 273 case '+': 274 vec_add(&s.epsilon, p); 275 break; 276 case '/': 277 vec_add(&s.accepts, trailing); 278 has_trailing = 1; 279 break; 280 case '\\': 281 c = reg.popFront(); 282 if (!c) 283 goto Lerror; 284 goto default; 285 default: 286 if (!ls.ignore_case || !isalpha(c)) 287 vec_add(&s.chars[c], (x = new_NFAState(ls))); 288 else { 289 vec_add(&s.chars[tolower(c)], (x = new_NFAState(ls))); 290 vec_add(&s.chars[toupper(c)], x); 291 } 292 p = s; 293 s = x; 294 break; 295 } 296 } 297 Lreturn: 298 vec_add(&s.epsilon, n); 299 areg = reg; 300 return has_trailing; 301 Lerror: 302 d_fail("bad (part of) regex: %s\n", areg); 303 return has_trailing; 304 } 305 306 private void 307 action_diff(ref VecAction a, ref VecAction b, ref VecAction c) { 308 int bb = 0, cc = 0; 309 while (1) { 310 if (bb >= b.n) 311 break; 312 Lagainc: 313 if (cc >= c.n) { 314 while (bb < b.n) 315 vec_add(&a, b[bb++]); 316 break; 317 } 318 Lagainb: 319 if (b[bb].index == c[cc].index) { 320 bb++; 321 cc++; 322 continue; 323 } 324 if (b[bb].index < c[cc].index) { 325 vec_add(&a, b[bb++]); 326 if (bb >= b.n) 327 break; 328 goto Lagainb; 329 } 330 cc++; 331 goto Lagainc; 332 } 333 } 334 335 private void 336 action_intersect(ref VecAction a, ref VecAction b, ref VecAction c) { 337 int bb = 0, cc = 0; 338 while (1) { 339 if (bb >= b.n) 340 break; 341 Lagainc: 342 if (cc >= c.n) 343 break; 344 Lagainb: 345 if (b[bb].index == c[cc].index) { 346 vec_add(&a, b[bb++]); 347 cc++; 348 continue; 349 } 350 if (b[bb].index < c[cc].index) { 351 bb++; 352 if (bb >= b.n) 353 break; 354 goto Lagainb; 355 } 356 cc++; 357 goto Lagainc; 358 } 359 } 360 361 private void 362 compute_liveness(Scanner *scanner) { 363 /* basis */ 364 foreach (ss; scanner.states) { 365 ss.liveSet.unionSet(ss.accepts); 366 } 367 bool changed = true; 368 while (changed) { 369 changed = false; 370 foreach (ss; scanner.states) { 371 for (int j = 0; j < 256; j++) { 372 ScanState* sss = ss.chars[j]; 373 if (sss) { 374 if (ss != sss) 375 if (ss.liveSet.unionSet(sss.liveSet)) 376 changed = true; 377 } 378 } 379 } 380 } 381 foreach (ss; scanner.states) { 382 ss.liveSet.toVec(ss.live); 383 ss.liveSet.clear(); 384 sort_VecAction(ss.live); 385 } 386 } 387 388 uint32 389 trans_hash_fn(ScanStateTransition *a) { 390 uint h = 0; 391 392 if (LIVE_DIFF_IN_TRANSITIONS) 393 foreach (i; a.live_diff) 394 h += 3 * i.index; 395 foreach (i; a.accepts_diff) 396 h += 3 * i.index; 397 return h; 398 } 399 400 int 401 trans_cmp_fn(ScanStateTransition *a, ScanStateTransition *b) { 402 int i; 403 404 if (LIVE_DIFF_IN_TRANSITIONS) 405 if (a.live_diff.n != b.live_diff.n) 406 return 1; 407 if (a.accepts_diff.n != b.accepts_diff.n) 408 return 1; 409 if (LIVE_DIFF_IN_TRANSITIONS) 410 for (i = 0; i < a.live_diff.n; i++) 411 if (a.live_diff[i] != b.live_diff[i]) 412 return 1; 413 for (i = 0; i < a.accepts_diff.n; i++) 414 if (a.accepts_diff[i] != b.accepts_diff[i]) 415 return 1; 416 return 0; 417 } 418 419 alias ScanStateTransitionSet = Set!(ScanStateTransition*, trans_hash_fn, trans_cmp_fn); 420 421 422 private void 423 build_transitions(LexState *ls, Scanner *s) { 424 int j; 425 ScanStateTransition *trans = null, x; 426 427 assert(s.transitions.length == 0); 428 ScanStateTransitionSet transitions; 429 430 foreach (ss; s.states) { 431 for (j = 0; j < 256; j++) { 432 if (!trans) { 433 trans = new ScanStateTransition(); 434 } 435 if (ss.chars[j]) { 436 action_diff(trans.live_diff, ss.live, ss.chars[j].live); 437 action_intersect(trans.accepts_diff, ss.accepts, 438 trans.live_diff); 439 } 440 if ((x = transitions.add(trans)) == trans) 441 trans = null; 442 else { 443 vec_free(&trans.live_diff); 444 vec_free(&trans.accepts_diff); 445 } 446 ss.transition[j] = x; 447 } 448 } 449 if (trans) 450 FREE(trans); 451 transitions.toVec(s.transitions); 452 for (int i = 0; i < s.transitions.n; i++) 453 s.transitions[i].index = i; 454 ls.transitions += s.transitions.n; 455 } 456 457 private void 458 compute_transitions(LexState *ls, Scanner *s) { 459 compute_liveness(s); 460 build_transitions(ls, s); 461 } 462 463 private void 464 build_state_scanner(Grammar *g, LexState *ls, State *s) { 465 NFAState *nn, nnn; 466 467 bool one = false; 468 NFAState *n = new_NFAState(ls); 469 /* first strings since they can be trivially combined as a tree */ 470 foreach (a; s.shift_actions) { 471 if (a.kind == ActionKind.ACTION_ACCEPT) { 472 one = true; 473 if (!n.chars[0].n) 474 vec_add(&n.chars[0], (nnn = new_NFAState(ls))); 475 else 476 nnn = n.chars[0][0]; 477 vec_add(&nnn.accepts, a); 478 } else if (a.kind == ActionKind.ACTION_SHIFT && a.term.kind == TermKind.TERM_STRING) { 479 one = true; 480 nn = n; 481 if (!a.term.ignore_case) { 482 foreach (uint8 c; cast(const uint8[])a.term.string_) { 483 if (!nn.chars[c].n) 484 vec_add(&nn.chars[c], (nnn = new_NFAState(ls))); 485 else 486 nnn = nn.chars[c][0]; 487 nn = nnn; 488 } 489 } else { /* use new states */ 490 foreach (uint8 c; cast(const uint8[])a.term.string_) { 491 nnn = new_NFAState(ls); 492 if (isalpha(c)) { 493 vec_add(&nn.chars[toupper(c)], nnn); 494 vec_add(&nn.chars[tolower(c)], nnn); 495 } else 496 vec_add(&nn.chars[c], nnn); 497 nn = nnn; 498 } 499 } 500 vec_add(&nn.accepts, a); 501 } 502 } 503 /* now regexes */ 504 foreach (a; s.shift_actions) { 505 if (a.kind == ActionKind.ACTION_SHIFT && a.term.kind == TermKind.TERM_REGEX) { 506 Action *trailing_context = new Action(); 507 *trailing_context = *a; 508 trailing_context.kind = ActionKind.ACTION_SHIFT_TRAILING; 509 trailing_context.index = g.action_count++; 510 one = true; 511 const(uint8)[] reg = cast(const uint8[])a.term.string_; 512 nnn = new_NFAState(ls); 513 vec_add(&n.epsilon, nnn); 514 nn = new_NFAState(ls); 515 ls.ignore_case = a.term.ignore_case; 516 if (build_regex_nfa(ls, reg, nnn, nn, trailing_context)) { 517 a.term.trailing_context = 1; 518 s.trailing_context = 1; 519 g.actions ~= trailing_context; 520 } else 521 FREE(trailing_context); 522 vec_add(&nn.accepts, a); 523 } 524 } 525 if (one) { 526 nfa_to_scanner(n, &s.scanner); 527 compute_transitions(ls, &s.scanner); 528 } 529 free_VecNFAState(&ls.allnfas); 530 ls.scanners++; 531 } 532 533 void 534 build_scanners(Grammar *g) { 535 int k; 536 LexState *ls = new LexState(); 537 538 /* detect identical scanners */ 539 for (int i = 0; i < g.states.length; i++) { 540 State *s = g.states[i]; 541 if (s.same_shifts) 542 continue; 543 for (int j = 0; j < i; j++) { 544 if (g.states[j].same_shifts) 545 continue; 546 if (g.states[j].shift_actions.length != s.shift_actions.length) 547 continue; 548 if (g.states[j].scan_kind != s.scan_kind) 549 continue; 550 for (k = 0; k < g.states[j].shift_actions.length; k++) 551 if (s.shift_actions[k].term != 552 g.states[j].shift_actions[k].term) 553 break; 554 if (k >= g.states[j].shift_actions.length) { 555 s.same_shifts = g.states[j]; 556 break; 557 } 558 } 559 } 560 /* build scanners */ 561 foreach (s; g.states) { 562 if (s.shift_actions.length) { 563 if (s.same_shifts) 564 s.scanner = s.same_shifts.scanner; 565 else 566 build_state_scanner(g, ls, s); 567 } 568 } 569 if (!__ctfe && d_verbose_level) 570 writefln("%d scanners %d transitions", ls.scanners, ls.transitions); 571 } 572