7.4.1 Examples

This subsection contains examples of learning simple grammars for dividing terminal symbol sequences into words. The examples use grammar files located in the directory $prefix/share/qsmm/samples/gram (installed from the directory samples/gram in the package distribution) and the auxiliary programs described in the sections pcfg-generate-seq and mk-rg-vit.sh.

To reproduce the examples, execute the following preparatory commands in a temporary directory:

$ mkdir seq log td-templ td-learn
$ prefix=/usr  # use a different prefix, if necessary

Continue by executing commands for the examples below.

Example 1

$ cat "$prefix/share/qsmm/samples/gram/3.pcfg"

  S: A B
  ;
  
  A: "a" "b" "c"
   | "d" "c" "b" "a"
  ;

  B: "e"
   | "f" "e"
  ;

$ pcfg-generate-seq -i1 -n2000 -o seq/3-2k.seq "$prefix/share/qsmm/samples/gram/3.pcfg"
$ mk-rg-vit.sh 6 6 2 >td-templ/3.rg

$ atd-parser -i1 -n20000 --det-niter-goal=50 --od=td-learn/3_det.rg  \
             --oo=log/3.log td-templ/3.rg seq/3-2k.seq

$ atd-parser -i1 --op=td-learn/3_out.pcfg --oo          \
             --simplify td-learn/3_det.rg seq/3-2k.seq

  [0]: p_td 0.70731857, p_rd 1.00000000, p_wp 0.81804014, p_np 0.79479479, cp 0

$ cat td-learn/3_out.pcfg

  S: "d" "c" "b" "a" L2_1  [0.51629073]  // 206
   | "a" "b" "c" L3_1      [0.48370927]  // 193
  ;  // 399

  L2_1: "f" "e"  [0.47572816]  // 98   0.75
      | "e"      [0.52427184]  // 108  0.63533835
  ;  // 206

  L3_1: "f" "e"  [0.50777202]  // 98  0.75
      | "e"      [0.49222798]  // 95  0.61904762
  ;  // 193

Example 2

$ cat "$prefix/share/qsmm/samples/gram/5.pcfg"

  S: B C
   | B C C
   | D C B
   | D D C
  ;
  
  B: "a" "a"
  ;
  
  C: "c" "b" "c"
  ;
  
  D: "d" "b" "b" "d"
  ;

$ pcfg-generate-seq -i1 -n10000 -o seq/5-10k.seq "$prefix/share/qsmm/samples/gram/5.pcfg"
$ mk-rg-vit.sh 5 5 3 2 >td-templ/5.rg

$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/5_det.rg  \
             --oo=log/5.log td-templ/5.rg seq/5-10k.seq

$ atd-parser -i1 --op=td-learn/5_out.pcfg --oo           \
             --simplify td-learn/5_det.rg seq/5-10k.seq

  [0]: p_td 0.58834791, p_rd 1.00000000, p_wp 0.82003105, p_np 0.81757230, cp 0

$ cat td-learn/5_out.pcfg

  S: "a"              [0.42766108]  // 1812  1
   | "c" "b" "c"      [0.35850838]  // 1519  1
   | "d" "b" "b" "d"  [0.21383054]  // 906   1
  ;  // 4237

Example 3

$ cat "$prefix/share/qsmm/samples/gram/6.pcfg"

  S: B C
   | B D D
   | B E E E
  ;
  
  B: "a" "b"
   | "b" "b"
  ;
  
  C: "a" "c"
   | "c" "c"
  ;
  
  D: "a" "d"
   | "d" "d"
  ;

  E: "a" "e"
   | "e" "e"
  ;

$ pcfg-generate-seq -i1 -n40000 -o seq/6-40k.seq "$prefix/share/qsmm/samples/gram/6.pcfg"
$ cat "$prefix/share/qsmm/samples/gram/6.rg"

  S: ( . .
     | . .
     ) ( . .
       | . . ( . . | . . )
       | . . ( . . | . . ) ( . . | . . )
       | . ( .
           | . ( . . | . . )
           | . ( . . | . . ) ( . . | . . )
           )
       )
  ;

$ atd-parser -i1 -n80000 --det-niter-goal=50 --od=td-learn/6_det.rg               \
             --oo=log/6.log "$prefix/share/qsmm/samples/gram/6.rg" seq/6-40k.seq

$ atd-parser -i1 --or=td-learn/6_resi.rg --oo td-learn/6_det.rg seq/6-40k.seq

  [0]: p_td 0.67223945, p_rd 0.96721932, p_wp 0.70524366, p_np 0.69389439, cp 108

$ cat td-learn/6_resi.rg

  S: ( "b" "b"
     | "a" "b"
     ) ( "a" ( "c"
             | "d" ( "d" "d"
                   | "a" "d"
                   )
             | "e" ( "e" "e"
                   | "a" "e"
                   ) ( "e" "e"
                     | "a" "e"
                     )
             )
       | "d" "d" ( "a" "d"
                 | "d" "d"
                 )
       | "e" "e" ( "e" "e"
                 | "a" "e"
                 ) ( "a" "e"
                   | "e" "e"
                   )
       | "c" "c"
       )
  ;

Example 4

$ cat "$prefix/share/qsmm/samples/gram/9.pcfg"

  S: "l" "a" "z" "y"
   | "f" "o" "x"
   | "f" "o" "x" "y"
  ;

$ pcfg-generate-seq -i1 -n5000 -o seq/9-5k.seq "$prefix/share/qsmm/samples/gram/9.pcfg"
$ mk-rg-vit.sh 5 5 3 2 >td-templ/9.rg

$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/9_det.rg  \
             --oo=log/9.log td-templ/9.rg seq/9-5k.seq

$ atd-parser -i1 --op=td-learn/9_out.pcfg --oo          \
             --simplify td-learn/9_det.rg seq/9-5k.seq

  [0]: p_td 0.56923116, p_rd 0.77034971, p_wp 0.84928628, p_np 0.84733894, cp 25

$ cat td-learn/9_out.pcfg

  S: "y" "f" "o" "x"  [0.33406917]  // 454  0.96864589
   | "f" "o" "x"      [0.32303164]  // 439  0.73741149
   | "y" "l" "a" "z"  [0.15820456]  // 215  0.50772377
   | "l" "a" "z" "y"  [0.18469463]  // 251
  ;  // 1359

Example 5

$ cat "$prefix/share/qsmm/samples/gram/10.pcfg"

  S: "c" "i" "l" "i"
   | "c" "i" "r" "c"
   | "c" "i" "t" "i"
   | "l" "i" "b" "e"
   | "l" "i" "f" "e"
   | "l" "i" "v" "e"
   | "v" "i" "l" "l"
   | "v" "i" "r" "u"
   | "v" "i" "v" "o"
  ;

$ pcfg-generate-seq -i1 -n40000 -o seq/10-40k.seq "$prefix/share/qsmm/samples/gram/10.pcfg"
$ mk-rg-vit.sh 5 5 3 2 >td-templ/10.rg

$ atd-parser -i1 -n80000 --det-niter-goal=50 --od=td-learn/10_det.rg  \
             --oo=log/10.log td-templ/10.rg seq/10-40k.seq

$ atd-parser -i1 --op=td-learn/10_out.pcfg --oo            \
             --simplify td-learn/10_det.rg seq/10-40k.seq

  [0]: p_td 0.55845396, p_rd 0.82352984, p_wp 0.67576938, p_np 0.66497500, cp 48

$ cat td-learn/10_out.pcfg

  S: "c" "i" "t" "i"  [0.1132]  // 1132  0.67919697
   | "c" "i" "l" "i"  [0.1121]  // 1121  0.67233259
   | "c" "i" "r" "c"  [0.108]   // 1080  0.64847044
   | "l" "i" L3_2     [0.3334]  // 3334
   | "v" "i" L3_0     [0.3333]  // 3333
  ;  // 10000

  L3_0: "v" "o"  [0.3360336]   // 1120  1
      | "r" "u"  [0.33183318]  // 1106  1
      | "l" "l"  [0.33213321]  // 1107
  ;  // 3333

  L3_2: "f" "e"  [0.34523095]  // 1151  1
      | "v" "e"  [0.33023395]  // 1101  1
      | "b" "e"  [0.32453509]  // 1082
  ;  // 3334

Example 6

$ cat "$prefix/share/qsmm/samples/gram/12.pcfg"

  W: "h" "e" A    [0.67]
   | "d"          [0.20]
   | "i" "k" "j"  [0.13]
  ;
  
  A: "e" "h" "j" [0.67]
   | "c"         [0.33]
  ;

$ pcfg-generate-seq -i1 -n10000 -o seq/12-10k.seq "$prefix/share/qsmm/samples/gram/12.pcfg"
$ mk-rg-vit.sh 5 5 3 2 >td-templ/12.rg

$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/12_det.rg  \
             --oo=log/12.log td-templ/12.rg seq/12-10k.seq

$ atd-parser -i1 --op=td-learn/12_out.pcfg --oo            \
             --simplify td-learn/12_det.rg seq/12-10k.seq

  [0]: p_td 0.68237916, p_rd 1.00000000, p_wp 0.84284780, p_np 0.84216843, cp 0

$ cat td-learn/12_out.pcfg

  S: "d"           [0.18908451]  // 537   1
   | "i" "k" "j"   [0.13485915]  // 383   1
   | "h" "e" L3_0  [0.67605634]  // 1920
  ;  // 2840

  L3_0: "e" "h" "j"  [0.66458333]  // 1276  1
      | "c"          [0.33541667]  // 644   1
  ;  // 1920

Example 7

$ cat "$prefix/share/qsmm/samples/gram/13.pcfg"

  S: "b" "i" "g"
   | "b" "l" "o" "c" "k"
   | "c" "a" "n"
   | "c" "i" "t" "y"
   | "f" "l" "a" "t" "s"
   | "l" "i" "v" "e"
   | "o" "f"
   | "o" "r"
   | "p" "l" "a" "c" "e"
  ;

$ pcfg-generate-seq -i1 -n200000 -o seq/13-200k.seq            \
                    "$prefix/share/qsmm/samples/gram/13.pcfg"

$ mk-rg-vit.sh 5 5 6 2 >td-templ/13.rg

$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/13_det.rg  \
             --oo=log/13.log td-templ/13.rg seq/13-200k.seq

$ atd-parser -i1 --op=td-learn/13_out.pcfg --oo             \
             --simplify td-learn/13_det.rg seq/13-200k.seq

  [0]: p_td 0.36151913, p_rd 0.70814220, p_wp 0.66418539, p_np 0.66117331, cp 49

$ cat td-learn/13_out.pcfg

  S: "f" "l" "a" "t" "s"  [0.10183488]  // 6166   1
   | "l" "i" "v" "e"      [0.10039802]  // 6079   1
   | "p" "l" "a" "c" "e"  [0.10029893]  // 6073   1
   | "y"                  [0.10021635]  // 6068   1
   | "c" "i" "t"          [0.10021635]  // 6068   0.75748802
   | "o" "f"              [0.1010091]   // 6116   0.75675161
   | "o" "r"              [0.09831707]  // 5953   0.74324839
   | "c" "a" "n"          [0.09823449]  // 5948   0.74251198
   | _S_8T2 L3_3          [0.19947481]  // 12078
  ;  // 60549

  L3_3: "g"          [0.50322901]  // 6078  1
      | "o" "c" "k"  [0.49677099]  // 6000  1
  ;  // 12078

  _S_8T2: "b" "i"  [0.50322901]  // 6078  0.75322888
        | "b" "l"  [0.49677099]  // 6000  0.74677112
  ;  // 12078

Example 8

$ cat "$prefix/share/qsmm/samples/gram/14.pcfg"

  S: "a"
   | "b" "i" "g"
   | "c" "a" "n"
   | "c" "i" "t" "y"
   | "i" "f"
   | "i" "n"
   | "i" "s"
   | "l" "i" "v" "e"
   | "o" "f"
   | "o" "r"
   | "t" "h" "e"
   | "t" "h" "e" "n"
   | "y" "o" "u"
  ;

$ pcfg-generate-seq -i1 -n500000 -o seq/14-500k.seq            \
                    "$prefix/share/qsmm/samples/gram/14.pcfg"

$ mk-rg-vit.sh 4 4 5 3 2 >td-templ/14.rg

$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/14_det.rg  \
             --oo=log/14.log td-templ/14.rg seq/14-500k.seq

$ atd-parser -i1 --op=td-learn/14_out.pcfg --oo             \
             --simplify td-learn/14_det.rg seq/14-500k.seq

  [0]: p_td 0.39224628, p_rd 0.86458056, p_wp 0.59397760, p_np 0.59276837, cp 53

$ cat td-learn/14_out.pcfg

  S: "t" "h" "e"      [0.14164605]  // 28337  1
   | "l" "i" "v" "e"  [0.07155532]  // 14315  1
   | "y" "o" "u"      [0.07155032]  // 14314  1
   | "b" "i" "g"      [0.07103547]  // 14211  1
   | "o" "r"          [0.07153033]  // 14310  0.75098025
   | "a"              [0.07254505]  // 14513  0.75027592
   | "o" "f"          [0.07125041]  // 14254  0.74901975
   | "n"              [0.07140036]  // 14284  0.66639098
   | "i" L3_1         [0.21334133]  // 42680
   | "c" L3_4         [0.14414536]  // 28837
  ;  // 200055

  L3_1: "s"  [0.33348172]  // 14233  1
      | "f"  [0.33479381]  // 14289  0.50145782
      | "n"  [0.33172446]  // 14158  0.41377197
  ;  // 42680

  L3_4: "a" "n"      [0.50216735]  // 14481  1
      | "i" "t" "y"  [0.49783265]  // 14356  1
  ;  // 28837