1 module ddparser.lex;
2 
3 import ddparser.gram;
4 import ddparser.util;
5 import ddparser.lr;
6 
7 import std.stdio;
8 
9 import core.stdc.stdlib;
10 import core.stdc.ctype;
11 
12 enum LIVE_DIFF_IN_TRANSITIONS = false;
13 
14 struct ScanStateTransition {
15   uint			index;
16   VecAction		live_diff;
17   VecAction		accepts_diff;
18 }
19 
20 struct ScanState {
21   uint			index;
22   ScanState*[256] chars;
23   VecAction		accepts;
24   VecAction		live;
25   PointerSet!Action liveSet;
26   ScanStateTransition*[256] transition;
27 }
28 
29 
30 struct NFAState {
31   uint			index;
32   Vec!(NFAState*)[256] chars;
33   Vec!(NFAState*)	epsilon;
34   Vec!(Action*)		accepts;
35   Vec!(Action*)		live;
36 }
37 
38 struct DFAState {
39   Vec!(NFAState*)	states;
40   DFAState*[256] chars;	
41   ScanState		*scan;
42 }
43 
44 alias VecDFAState = Vec!(DFAState *);
45 alias VecNFAState = Vec!(NFAState *);
46 
47 struct LexState {
48   uint nfa_index;
49   VecNFAState allnfas;
50   uint transitions;
51   uint scanners;
52   uint ignore_case;
53 }
54 
55 private NFAState *
56 new_NFAState(LexState *ls) @safe {
57   NFAState *n = new NFAState();
58   n.index = ls.nfa_index++;
59   vec_add(&ls.allnfas, n);
60   return n;
61 }
62 
63 private void
64 free_DFAState(DFAState *y) @safe {
65   vec_free(&y.states);
66   FREE(y);
67 }
68 
69 private void
70 free_VecDFAState(VecDFAState *dfas) @safe {
71   int i;
72   for (i = 0; i < dfas.length; i++)
73     free_DFAState((*dfas)[i]);
74   vec_free(dfas);
75 }
76 
77 private void
78 free_NFAState(NFAState *y) @safe {
79   int i;
80   for (i = 0; i < 256; i++)
81     vec_free(&y.chars[i]);
82   vec_free(&y.epsilon);
83   vec_free(&y.accepts);
84   FREE(y);
85 }
86 
87 private void
88 free_VecNFAState(VecNFAState *nfas) {
89   int i;
90   for (i = 0; i < nfas.n; i++)
91     free_NFAState(nfas.v[i]);
92   vec_free(nfas);
93 }
94 
95 extern(C) private int 
96 nfacmp(const void *ai, const void *aj) {
97   uint32 i = (*cast(NFAState**)ai).index;
98   uint32 j = (*cast(NFAState**)aj).index;
99   return (i > j) ? 1 : ((i < j) ? -1 : 0);
100 }
101 
102 private void
103 nfa_closure(DFAState *x) {
104     int j, k;
105 
106     for (int i = 0; i < x.states.length; ++i) // x.states may change
107         foreach (j; x.states[i].epsilon) {
108             foreach (k; x.states)
109                 if (j == k)
110                     goto Lbreak;
111             vec_add(&x.states, j);
112 Lbreak:;
113         }
114     qsort(x.states.v, x.states.n, (x.states.v[0]).sizeof, &nfacmp);
115 }
116 
117 private bool
118 eq_dfa_state(DFAState *x, DFAState *y) @safe {
119   if (x.states.n != y.states.n)
120     return false;
121   for (int i = 0; i < x.states.n; i++)
122     if (x.states[i] != y.states[i])
123       return false;
124   return true;
125 }
126 
127 private void
128 dfa_to_scanner(ref VecDFAState alldfas, ref ScanState*[] scanner) {
129   assert(scanner.length == 0);
130   for (int i = 0; i < alldfas.n; i++) {
131     alldfas[i].scan = new ScanState();
132     alldfas[i].scan.index = i;
133     scanner ~= alldfas[i].scan;
134   }
135   foreach (i; alldfas) {
136     for (int j = 0; j < 256; j++)
137       if (i.chars[j])
138 	i.scan.chars[j] = i.chars[j].scan;
139     int highest = int.min;
140     foreach (j; i.states)
141       foreach (k; j.accepts) {
142 	int p = k.term.term_priority;
143 	if (highest < p)
144 	  highest = p;
145       }
146     foreach (j; i.states)
147       foreach (k; j.accepts) {
148 	if (k.term.term_priority == highest)
149 	  vec_add(&i.scan.accepts, k);
150       }
151   }
152 }
153 
154 private void
155 nfa_to_scanner(NFAState *n, Scanner *s) {
156   DFAState *x = new DFAState();
157   VecDFAState alldfas;
158 
159   vec_add(&x.states, n);
160   nfa_closure(x);
161   vec_add(&alldfas, x);
162   for (int i = 0; i < alldfas.length; i++) { // alldfas may change while iterating
163       for (int i_char = 0; i_char < 256; i_char++) {
164           PointerSet!NFAState states;
165 
166           foreach (i_state; alldfas[i].states) {
167               foreach (i; i_state.chars[i_char]) {
168                   states.add(i);
169               }
170           }
171 
172           if (!states.isEmpty) {
173               DFAState *y = new DFAState();
174               states.toVec(y.states);
175               nfa_closure(y);
176               foreach (i; alldfas)
177                   if (eq_dfa_state(y, i)) {
178                       free_DFAState(y);
179                       y = i;
180                       goto Lnext;
181                   }
182               vec_add(&alldfas, y);
183 Lnext:
184               alldfas[i].chars[i_char] = y;
185           }
186       }
187   }
188   dfa_to_scanner(alldfas, s.states);
189   free_VecDFAState(&alldfas);
190 }
191 
192 T popFront(T)(ref T[] v) @safe
193 {
194     if (!v.length) return 0;
195     scope(exit) v = v[1 .. $];
196     return v[0];
197 }
198 
199 /* build a NFA for the regular expression */
200 private int
201 build_regex_nfa(LexState *ls, ref const(uint8)[] areg, NFAState *pp, NFAState *nn, Action *trailing) @safe {
202   uint8 c;
203   const(ubyte)[] reg = areg;
204   NFAState *p = pp, s, x, n = nn;
205   int i, has_trailing = 0;
206 
207   s = p;
208   while ((c = reg.popFront()) != 0) {
209       switch(c) {
210           case '(':
211               x = new_NFAState(ls);
212               has_trailing = build_regex_nfa(ls, reg, s, x, trailing) ||
213                   has_trailing;
214               p = s;
215               s = x;
216               break;
217           case ')':
218               goto Lreturn;
219           case '|':
220               vec_add(&s.epsilon, nn);
221               vec_add(&pp.epsilon, (s = new_NFAState(ls)));
222               break;
223           case '[':
224               {
225                   bool[256] mark;
226                   bool reversed = false;
227                   if (reg[0] == '^') {
228                       reg.popFront();
229                       reversed = true;
230                   }
231 
232                   ubyte pc = ubyte.max;
233                   while ((c = reg.popFront()) != 0) {
234                       switch(c) {
235                           case ']':
236                               goto Lsetdone;
237                           case '-':
238                               c = reg.popFront();
239                               if (!c)
240                                   goto Lerror;
241                               if (c == '\\')
242                                   c = reg.popFront();
243                               if (!c)
244                                   goto Lerror;
245                               for (;pc <= c; pc++)
246                                   mark[pc] = true;
247                               break;
248                           case '\\':
249                               c = reg.popFront();
250                               goto default;
251                           default:
252                               pc = c;
253                               mark[c] = true;
254                               break;
255                       }
256                   }
257 Lsetdone:
258                   x = new_NFAState(ls);
259                   for (i = 1; i < 256; i++)
260                       if ((!reversed && mark[i]) || (reversed && !mark[i]))
261                           vec_add(&s.chars[i], x);
262                   p = s;
263                   s = x;
264               }
265               break;
266           case '?':
267               vec_add(&p.epsilon, s);
268               break;
269           case '*':
270               vec_add(&p.epsilon, s);
271               vec_add(&s.epsilon, p);
272               break;
273           case '+':
274               vec_add(&s.epsilon, p);
275               break;
276           case '/':
277               vec_add(&s.accepts, trailing);
278               has_trailing = 1;
279               break;
280           case '\\':
281               c = reg.popFront();
282               if (!c)	
283                   goto Lerror;
284               goto default;
285           default:
286               if (!ls.ignore_case || !isalpha(c))
287                   vec_add(&s.chars[c], (x = new_NFAState(ls))); 
288               else {
289                   vec_add(&s.chars[tolower(c)], (x = new_NFAState(ls))); 
290                   vec_add(&s.chars[toupper(c)], x);
291               }
292               p = s;
293               s = x;
294               break;
295       }
296   }
297  Lreturn:
298   vec_add(&s.epsilon, n);
299   areg = reg;
300   return has_trailing;
301  Lerror:
302   d_fail("bad (part of) regex: %s\n", areg);
303   return has_trailing;
304 }
305 
306 private void
307 action_diff(ref VecAction a, ref VecAction b, ref VecAction c) {
308   int bb = 0, cc = 0;
309   while (1) {
310     if (bb >= b.n)
311       break;
312   Lagainc:
313     if (cc >= c.n) {
314       while (bb < b.n)
315 	vec_add(&a, b[bb++]);
316       break;
317     }
318   Lagainb:
319     if (b[bb].index == c[cc].index) {
320       bb++;
321       cc++;
322       continue;
323     }
324     if (b[bb].index < c[cc].index) {
325       vec_add(&a, b[bb++]);
326       if (bb >= b.n)
327 	break;
328       goto Lagainb;
329     }
330     cc++;
331     goto Lagainc;
332   }
333 }
334 
335 private void
336 action_intersect(ref VecAction a, ref VecAction b, ref VecAction c) {
337   int bb = 0, cc = 0;
338   while (1) {
339     if (bb >= b.n)
340       break;
341   Lagainc:
342     if (cc >= c.n)
343       break;
344   Lagainb:
345     if (b[bb].index == c[cc].index) {
346       vec_add(&a, b[bb++]);
347       cc++;
348       continue;
349     }
350     if (b[bb].index < c[cc].index) {
351       bb++;
352       if (bb >= b.n)
353 	break;
354       goto Lagainb;
355     }
356     cc++;
357     goto Lagainc;
358   }
359 }
360 
361 private void
362 compute_liveness(Scanner *scanner) {
363   /* basis */
364   foreach (ss; scanner.states) {
365     ss.liveSet.unionSet(ss.accepts);
366   }
367   bool changed = true;
368   while (changed) {
369     changed = false;
370     foreach (ss; scanner.states) {
371       for (int j = 0; j < 256; j++) {
372           ScanState* sss = ss.chars[j];
373           if (sss) {
374               if (ss != sss)
375                   if (ss.liveSet.unionSet(sss.liveSet))
376                       changed = true;
377           }
378       }
379     }
380   }
381   foreach (ss; scanner.states) {
382     ss.liveSet.toVec(ss.live);
383     ss.liveSet.clear();
384     sort_VecAction(ss.live);
385   }
386 }
387 
388 uint32
389 trans_hash_fn(ScanStateTransition *a) {
390   uint h = 0;
391 
392   if (LIVE_DIFF_IN_TRANSITIONS)
393     foreach (i; a.live_diff)
394       h += 3 * i.index;
395   foreach (i; a.accepts_diff)
396     h += 3 * i.index;
397   return h;
398 }
399 
400 int
401 trans_cmp_fn(ScanStateTransition *a, ScanStateTransition *b) {
402   int i;
403   
404   if (LIVE_DIFF_IN_TRANSITIONS)
405     if (a.live_diff.n != b.live_diff.n)
406       return 1;
407   if (a.accepts_diff.n != b.accepts_diff.n)
408     return 1;
409   if (LIVE_DIFF_IN_TRANSITIONS)
410     for (i = 0; i < a.live_diff.n; i++)
411       if (a.live_diff[i] != b.live_diff[i])
412 	return 1;
413   for (i = 0; i < a.accepts_diff.n; i++)
414     if (a.accepts_diff[i] != b.accepts_diff[i])
415       return 1;
416   return 0;
417 }
418 
419 alias ScanStateTransitionSet = Set!(ScanStateTransition*, trans_hash_fn, trans_cmp_fn);
420 
421 
422 private void
423 build_transitions(LexState *ls, Scanner *s) {
424   int j;
425   ScanStateTransition *trans = null, x;
426 
427   assert(s.transitions.length == 0);
428   ScanStateTransitionSet transitions;
429 
430   foreach (ss; s.states) {
431       for (j = 0; j < 256; j++) {
432           if (!trans) {
433               trans = new ScanStateTransition();
434           }
435           if (ss.chars[j]) {
436               action_diff(trans.live_diff, ss.live, ss.chars[j].live);
437               action_intersect(trans.accepts_diff, ss.accepts, 
438                       trans.live_diff);
439           }
440           if ((x = transitions.add(trans)) == trans)
441               trans = null;
442           else {
443               vec_free(&trans.live_diff); 
444               vec_free(&trans.accepts_diff);
445           }
446           ss.transition[j] = x;
447       }
448   }
449   if (trans)
450     FREE(trans);
451   transitions.toVec(s.transitions);
452   for (int i = 0; i < s.transitions.n; i++)
453     s.transitions[i].index = i;
454   ls.transitions += s.transitions.n;
455 }
456 
457 private void
458 compute_transitions(LexState *ls, Scanner *s) {
459   compute_liveness(s);
460   build_transitions(ls, s);
461 }
462 
463 private void
464 build_state_scanner(Grammar *g, LexState *ls, State *s) {
465   NFAState *nn, nnn;
466 
467   bool one = false;
468   NFAState *n = new_NFAState(ls);
469   /* first strings since they can be trivially combined as a tree */
470   foreach (a; s.shift_actions) {
471     if (a.kind == ActionKind.ACTION_ACCEPT) {
472       one = true;
473       if (!n.chars[0].n) 
474 	vec_add(&n.chars[0], (nnn = new_NFAState(ls)));
475       else
476 	nnn = n.chars[0][0];
477       vec_add(&nnn.accepts, a);
478     } else if (a.kind == ActionKind.ACTION_SHIFT && a.term.kind == TermKind.TERM_STRING) {
479       one = true;
480       nn = n;
481       if (!a.term.ignore_case) {
482 	foreach (uint8 c; cast(const uint8[])a.term.string_) {
483 	  if (!nn.chars[c].n) 
484 	    vec_add(&nn.chars[c], (nnn = new_NFAState(ls)));
485 	  else
486 	    nnn = nn.chars[c][0];
487 	  nn = nnn;
488 	}
489       } else { /* use new states */
490 	foreach (uint8 c; cast(const uint8[])a.term.string_) {
491       nnn = new_NFAState(ls);
492 	  if (isalpha(c)) {
493 	    vec_add(&nn.chars[toupper(c)], nnn);
494 	    vec_add(&nn.chars[tolower(c)], nnn);
495 	  } else
496 	    vec_add(&nn.chars[c], nnn);
497 	  nn = nnn;
498 	}
499       }
500       vec_add(&nn.accepts, a);
501     }
502   }
503   /* now regexes */
504   foreach (a; s.shift_actions) {
505     if (a.kind == ActionKind.ACTION_SHIFT && a.term.kind == TermKind.TERM_REGEX) {
506       Action *trailing_context = new Action();
507       *trailing_context = *a;
508       trailing_context.kind = ActionKind.ACTION_SHIFT_TRAILING;
509       trailing_context.index = g.action_count++;
510       one = true;
511       const(uint8)[] reg = cast(const uint8[])a.term.string_;
512       nnn = new_NFAState(ls);
513       vec_add(&n.epsilon, nnn);
514       nn = new_NFAState(ls);
515       ls.ignore_case = a.term.ignore_case;
516       if (build_regex_nfa(ls, reg, nnn, nn, trailing_context)) {
517 	a.term.trailing_context = 1;
518 	s.trailing_context = 1;
519     g.actions ~= trailing_context;
520       } else
521 	FREE(trailing_context);
522       vec_add(&nn.accepts, a);
523     }
524   }
525   if (one) {
526     nfa_to_scanner(n, &s.scanner);
527     compute_transitions(ls, &s.scanner);
528   }
529   free_VecNFAState(&ls.allnfas);
530   ls.scanners++;
531 }
532 
533 void 
534 build_scanners(Grammar *g) {
535   int k;
536   LexState *ls = new LexState();
537 
538   /* detect identical scanners */
539   for (int i = 0; i < g.states.length; i++) {
540     State *s = g.states[i];
541     if (s.same_shifts)
542       continue;
543     for (int j = 0; j < i; j++) {
544       if (g.states[j].same_shifts)
545 	continue;
546       if (g.states[j].shift_actions.length != s.shift_actions.length)
547 	continue;
548       if (g.states[j].scan_kind != s.scan_kind)
549 	continue;
550       for (k = 0; k < g.states[j].shift_actions.length; k++)
551 	if (s.shift_actions[k].term != 
552 	    g.states[j].shift_actions[k].term)
553 	  break;
554       if (k >= g.states[j].shift_actions.length) {
555 	s.same_shifts = g.states[j];
556 	break;
557       }
558     }
559   }
560   /* build scanners */
561   foreach (s; g.states) {
562     if (s.shift_actions.length) {
563       if (s.same_shifts)
564 	s.scanner = s.same_shifts.scanner;
565       else 
566 	build_state_scanner(g, ls, s);
567     }
568   }
569   if (!__ctfe && d_verbose_level)
570     writefln("%d scanners %d transitions", ls.scanners, ls.transitions);
571 }
572