This subsection contains examples of learning simple grammars for dividing terminal symbol sequences into words.
The examples use grammar files located in the directory $prefix/share/qsmm/samples/gram (installed from the directory samples/gram in the package distribution) and the auxiliary programs pcfg-generate-seq and mk-rg-vit.sh, described in the sections of the same names.
To reproduce the examples, execute the following preparation commands in a temporary directory:
$ mkdir seq log td-templ td-learn
$ prefix=/usr # use a different prefix, if necessary
Continue by executing commands for the examples below.
$ cat "$prefix/share/qsmm/samples/gram/3.pcfg"
S: A B
;
A: "a" "b" "c"
| "d" "c" "b" "a"
;
B: "e"
| "f" "e"
;
$ pcfg-generate-seq -i1 -n2000 -o seq/3-2k.seq "$prefix/share/qsmm/samples/gram/3.pcfg"
$ mk-rg-vit.sh 6 6 2 >td-templ/3.rg
$ atd-parser -i1 -n20000 --det-niter-goal=50 --od=td-learn/3_det.rg \
--oo=log/3.log td-templ/3.rg seq/3-2k.seq
$ atd-parser -i1 --op=td-learn/3_out.pcfg --oo \
--simplify td-learn/3_det.rg seq/3-2k.seq
[0]: p_td 0.70731857, p_rd 1.00000000, p_wp 0.81804014, p_np 0.79479479, cp 0
$ cat td-learn/3_out.pcfg
S: "d" "c" "b" "a" L2_1 [0.51629073] // 206
| "a" "b" "c" L3_1 [0.48370927] // 193
; // 399
L2_1: "f" "e" [0.47572816] // 98 0.75
| "e" [0.52427184] // 108 0.63533835
; // 206
L3_1: "f" "e" [0.50777202] // 98 0.75
| "e" [0.49222798] // 95 0.61904762
; // 193
$ cat "$prefix/share/qsmm/samples/gram/5.pcfg"
S: B C
| B C C
| D C B
| D D C
;
B: "a" "a"
;
C: "c" "b" "c"
;
D: "d" "b" "b" "d"
;
$ pcfg-generate-seq -i1 -n10000 -o seq/5-10k.seq "$prefix/share/qsmm/samples/gram/5.pcfg"
$ mk-rg-vit.sh 5 5 3 2 >td-templ/5.rg
$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/5_det.rg \
--oo=log/5.log td-templ/5.rg seq/5-10k.seq
$ atd-parser -i1 --op=td-learn/5_out.pcfg --oo \
--simplify td-learn/5_det.rg seq/5-10k.seq
[0]: p_td 0.58834791, p_rd 1.00000000, p_wp 0.82003105, p_np 0.81757230, cp 0
$ cat td-learn/5_out.pcfg
S: "a" [0.42766108] // 1812 1
| "c" "b" "c" [0.35850838] // 1519 1
| "d" "b" "b" "d" [0.21383054] // 906 1
; // 4237
$ cat "$prefix/share/qsmm/samples/gram/6.pcfg"
S: B C
| B D D
| B E E E
;
B: "a" "b"
| "b" "b"
;
C: "a" "c"
| "c" "c"
;
D: "a" "d"
| "d" "d"
;
E: "a" "e"
| "e" "e"
;
$ pcfg-generate-seq -i1 -n40000 -o seq/6-40k.seq "$prefix/share/qsmm/samples/gram/6.pcfg"
$ cat "$prefix/share/qsmm/samples/gram/6.rg"
S: ( . .
| . .
) ( . .
| . . ( . . | . . )
| . . ( . . | . . ) ( . . | . . )
| . ( .
| . ( . . | . . )
| . ( . . | . . ) ( . . | . . )
)
)
;
$ atd-parser -i1 -n80000 --det-niter-goal=50 --od=td-learn/6_det.rg \
--oo=log/6.log "$prefix/share/qsmm/samples/gram/6.rg" seq/6-40k.seq
$ atd-parser -i1 --or=td-learn/6_resi.rg --oo td-learn/6_det.rg seq/6-40k.seq
[0]: p_td 0.67223945, p_rd 0.96721932, p_wp 0.70524366, p_np 0.69389439, cp 108
$ cat td-learn/6_resi.rg
S: ( "b" "b"
| "a" "b"
) ( "a" ( "c"
| "d" ( "d" "d"
| "a" "d"
)
| "e" ( "e" "e"
| "a" "e"
) ( "e" "e"
| "a" "e"
)
)
| "d" "d" ( "a" "d"
| "d" "d"
)
| "e" "e" ( "e" "e"
| "a" "e"
) ( "a" "e"
| "e" "e"
)
| "c" "c"
)
;
$ cat "$prefix/share/qsmm/samples/gram/9.pcfg"
S: "l" "a" "z" "y"
| "f" "o" "x"
| "f" "o" "x" "y"
;
$ pcfg-generate-seq -i1 -n5000 -o seq/9-5k.seq "$prefix/share/qsmm/samples/gram/9.pcfg"
$ mk-rg-vit.sh 5 5 3 2 >td-templ/9.rg
$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/9_det.rg \
--oo=log/9.log td-templ/9.rg seq/9-5k.seq
$ atd-parser -i1 --op=td-learn/9_out.pcfg --oo \
--simplify td-learn/9_det.rg seq/9-5k.seq
[0]: p_td 0.56923116, p_rd 0.77034971, p_wp 0.84928628, p_np 0.84733894, cp 25
$ cat td-learn/9_out.pcfg
S: "y" "f" "o" "x" [0.33406917] // 454 0.96864589
| "f" "o" "x" [0.32303164] // 439 0.73741149
| "y" "l" "a" "z" [0.15820456] // 215 0.50772377
| "l" "a" "z" "y" [0.18469463] // 251
; // 1359
$ cat "$prefix/share/qsmm/samples/gram/10.pcfg"
S: "c" "i" "l" "i"
| "c" "i" "r" "c"
| "c" "i" "t" "i"
| "l" "i" "b" "e"
| "l" "i" "f" "e"
| "l" "i" "v" "e"
| "v" "i" "l" "l"
| "v" "i" "r" "u"
| "v" "i" "v" "o"
;
$ pcfg-generate-seq -i1 -n40000 -o seq/10-40k.seq "$prefix/share/qsmm/samples/gram/10.pcfg"
$ mk-rg-vit.sh 5 5 3 2 >td-templ/10.rg
$ atd-parser -i1 -n80000 --det-niter-goal=50 --od=td-learn/10_det.rg \
--oo=log/10.log td-templ/10.rg seq/10-40k.seq
$ atd-parser -i1 --op=td-learn/10_out.pcfg --oo \
--simplify td-learn/10_det.rg seq/10-40k.seq
[0]: p_td 0.55845396, p_rd 0.82352984, p_wp 0.67576938, p_np 0.66497500, cp 48
$ cat td-learn/10_out.pcfg
S: "c" "i" "t" "i" [0.1132] // 1132 0.67919697
| "c" "i" "l" "i" [0.1121] // 1121 0.67233259
| "c" "i" "r" "c" [0.108] // 1080 0.64847044
| "l" "i" L3_2 [0.3334] // 3334
| "v" "i" L3_0 [0.3333] // 3333
; // 10000
L3_0: "v" "o" [0.3360336] // 1120 1
| "r" "u" [0.33183318] // 1106 1
| "l" "l" [0.33213321] // 1107
; // 3333
L3_2: "f" "e" [0.34523095] // 1151 1
| "v" "e" [0.33023395] // 1101 1
| "b" "e" [0.32453509] // 1082
; // 3334
$ cat "$prefix/share/qsmm/samples/gram/12.pcfg"
W: "h" "e" A [0.67]
| "d" [0.20]
| "i" "k" "j" [0.13]
;
A: "e" "h" "j" [0.67]
| "c" [0.33]
;
$ pcfg-generate-seq -i1 -n10000 -o seq/12-10k.seq "$prefix/share/qsmm/samples/gram/12.pcfg"
$ mk-rg-vit.sh 5 5 3 2 >td-templ/12.rg
$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/12_det.rg \
--oo=log/12.log td-templ/12.rg seq/12-10k.seq
$ atd-parser -i1 --op=td-learn/12_out.pcfg --oo \
--simplify td-learn/12_det.rg seq/12-10k.seq
[0]: p_td 0.68237916, p_rd 1.00000000, p_wp 0.84284780, p_np 0.84216843, cp 0
$ cat td-learn/12_out.pcfg
S: "d" [0.18908451] // 537 1
| "i" "k" "j" [0.13485915] // 383 1
| "h" "e" L3_0 [0.67605634] // 1920
; // 2840
L3_0: "e" "h" "j" [0.66458333] // 1276 1
| "c" [0.33541667] // 644 1
; // 1920
$ cat "$prefix/share/qsmm/samples/gram/13.pcfg"
S: "b" "i" "g"
| "b" "l" "o" "c" "k"
| "c" "a" "n"
| "c" "i" "t" "y"
| "f" "l" "a" "t" "s"
| "l" "i" "v" "e"
| "o" "f"
| "o" "r"
| "p" "l" "a" "c" "e"
;
$ pcfg-generate-seq -i1 -n200000 -o seq/13-200k.seq \
"$prefix/share/qsmm/samples/gram/13.pcfg"
$ mk-rg-vit.sh 5 5 6 2 >td-templ/13.rg
$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/13_det.rg \
--oo=log/13.log td-templ/13.rg seq/13-200k.seq
$ atd-parser -i1 --op=td-learn/13_out.pcfg --oo \
--simplify td-learn/13_det.rg seq/13-200k.seq
[0]: p_td 0.36151913, p_rd 0.70814220, p_wp 0.66418539, p_np 0.66117331, cp 49
$ cat td-learn/13_out.pcfg
S: "f" "l" "a" "t" "s" [0.10183488] // 6166 1
| "l" "i" "v" "e" [0.10039802] // 6079 1
| "p" "l" "a" "c" "e" [0.10029893] // 6073 1
| "y" [0.10021635] // 6068 1
| "c" "i" "t" [0.10021635] // 6068 0.75748802
| "o" "f" [0.1010091] // 6116 0.75675161
| "o" "r" [0.09831707] // 5953 0.74324839
| "c" "a" "n" [0.09823449] // 5948 0.74251198
| _S_8T2 L3_3 [0.19947481] // 12078
; // 60549
L3_3: "g" [0.50322901] // 6078 1
| "o" "c" "k" [0.49677099] // 6000 1
; // 12078
_S_8T2: "b" "i" [0.50322901] // 6078 0.75322888
| "b" "l" [0.49677099] // 6000 0.74677112
; // 12078
$ cat "$prefix/share/qsmm/samples/gram/14.pcfg"
S: "a"
| "b" "i" "g"
| "c" "a" "n"
| "c" "i" "t" "y"
| "i" "f"
| "i" "n"
| "i" "s"
| "l" "i" "v" "e"
| "o" "f"
| "o" "r"
| "t" "h" "e"
| "t" "h" "e" "n"
| "y" "o" "u"
;
$ pcfg-generate-seq -i1 -n500000 -o seq/14-500k.seq \
"$prefix/share/qsmm/samples/gram/14.pcfg"
$ mk-rg-vit.sh 4 4 5 3 2 >td-templ/14.rg
$ atd-parser -i1 --det-niter-goal=50 --od=td-learn/14_det.rg \
--oo=log/14.log td-templ/14.rg seq/14-500k.seq
$ atd-parser -i1 --op=td-learn/14_out.pcfg --oo \
--simplify td-learn/14_det.rg seq/14-500k.seq
[0]: p_td 0.39224628, p_rd 0.86458056, p_wp 0.59397760, p_np 0.59276837, cp 53
$ cat td-learn/14_out.pcfg
S: "t" "h" "e" [0.14164605] // 28337 1
| "l" "i" "v" "e" [0.07155532] // 14315 1
| "y" "o" "u" [0.07155032] // 14314 1
| "b" "i" "g" [0.07103547] // 14211 1
| "o" "r" [0.07153033] // 14310 0.75098025
| "a" [0.07254505] // 14513 0.75027592
| "o" "f" [0.07125041] // 14254 0.74901975
| "n" [0.07140036] // 14284 0.66639098
| "i" L3_1 [0.21334133] // 42680
| "c" L3_4 [0.14414536] // 28837
; // 200055
L3_1: "s" [0.33348172] // 14233 1
| "f" [0.33479381] // 14289 0.50145782
| "n" [0.33172446] // 14158 0.41377197
; // 42680
L3_4: "a" "n" [0.50216735] // 14481 1
| "i" "t" "y" [0.49783265] // 14356 1
; // 28837