competition update
This commit is contained in:
2
language_model/srilm-1.7.3/flm/test/Makefile
Normal file
2
language_model/srilm-1.7.3/flm/test/Makefile
Normal file
@@ -0,0 +1,2 @@
|
||||
|
||||
include $(SRILM)/common/Makefile.test
|
||||
@@ -0,0 +1,76 @@
|
||||
Language Model 0 --------------
|
||||
-- Level 3
|
||||
Node: W-1,M-1,S-1 (0x7), Constraint: W-1 (0x1)
|
||||
1 Children: M-1,S-1 (0x6)
|
||||
-- Level 2
|
||||
Node: M-1,S-1 (0x6), Constraint: M-1,S-1 (0x6)
|
||||
2 Children: S-1 (0x4); M-1 (0x2)
|
||||
-- Level 1
|
||||
Node: M-1 (0x2), Constraint: M-1 (0x2)
|
||||
1 Children: (0x0)
|
||||
Node: S-1 (0x4), Constraint: S-1 (0x4)
|
||||
1 Children: (0x0)
|
||||
-- Level 0
|
||||
Node: (0x0), Constraint: (0x0)
|
||||
0 Children:
|
||||
../fngram-count/ch_lm_train100.noamp.decomposed.txt.gz: line 22292:
|
||||
LM(0) 22292 sentences, 168590 words, 8102 OOVs
|
||||
0 zeroprobs, logprob= 0 ppl= 1 ppl1= 1
|
||||
Mod Kneser-Ney smoothing 0-grams
|
||||
n1 = 583
|
||||
n2 = 2441
|
||||
n3 = 1096
|
||||
n4 = 627
|
||||
D1 = 0.106679
|
||||
D2 = 1.8563
|
||||
D3+ = 2.75588
|
||||
Mod Kneser-Ney smoothing 0x2-grams
|
||||
n1 = 35551
|
||||
n2 = 5234
|
||||
n3 = 1641
|
||||
n4 = 790
|
||||
D1 = 0.772529
|
||||
D2 = 1.27337
|
||||
D3+ = 1.51238
|
||||
Mod Kneser-Ney smoothing 0x4-grams
|
||||
n1 = 50515
|
||||
n2 = 5109
|
||||
n3 = 1321
|
||||
n4 = 555
|
||||
D1 = 0.831755
|
||||
D2 = 1.35482
|
||||
D3+ = 1.6022
|
||||
Mod Kneser-Ney smoothing 0x6-grams
|
||||
n1 = 68172
|
||||
n2 = 1480
|
||||
n3 = 367
|
||||
n4 = 152
|
||||
D1 = 0.958387
|
||||
D2 = 1.28704
|
||||
D3+ = 1.41226
|
||||
Mod Kneser-Ney smoothing 0x7-grams
|
||||
n1 = 54976
|
||||
n2 = 8146
|
||||
n3 = 2670
|
||||
n4 = 1345
|
||||
D1 = 0.771398
|
||||
D2 = 1.24148
|
||||
D3+ = 1.44565
|
||||
warning: distributing 0.216388 left-over probability mass over all 6919 words
|
||||
discarded 1 0x2-gram probs predicting pseudo-events
|
||||
discarded 40785 0x2-gram probs discounted to zero
|
||||
discarded 1 0x4-gram probs predicting pseudo-events
|
||||
discarded 1 0x6-gram probs predicting pseudo-events
|
||||
discarded 70563 0x6-gram probs discounted to zero
|
||||
Finished estimation of multi-child graph-backoff node: LM 0 Node 0x6
|
||||
discarded 1 0x7-gram probs predicting pseudo-events
|
||||
discarded 54976 0x7-gram probs discounted to zero
|
||||
writing FLM to dev.lm.gz
|
||||
writing 6920 0x0-grams
|
||||
writing 0 0x1-grams
|
||||
writing 4387 0x2-grams
|
||||
writing 58528 0x4-grams
|
||||
writing 0 0x3-grams
|
||||
writing 0 0x5-grams
|
||||
writing 0 0x6-grams
|
||||
writing 15858 0x7-grams
|
||||
@@ -0,0 +1,4 @@
|
||||
Perplexity on file: ../fngram-count/ch_lm_dev.noamp.decomposed.txt.gz
|
||||
Language model: 0
|
||||
6003 sentences, 32148 words, 0 OOVs
|
||||
0 zeroprobs, logprob= -80707.8 ppl= 130.462 ppl1= 323.972
|
||||
@@ -0,0 +1,76 @@
|
||||
Language Model 0 --------------
|
||||
-- Level 3
|
||||
Node: W-1,M-1,S-1 (0x7), Constraint: W-1 (0x1)
|
||||
1 Children: M-1,S-1 (0x6)
|
||||
-- Level 2
|
||||
Node: M-1,S-1 (0x6), Constraint: M-1,S-1 (0x6)
|
||||
2 Children: S-1 (0x4); M-1 (0x2)
|
||||
-- Level 1
|
||||
Node: M-1 (0x2), Constraint: M-1 (0x2)
|
||||
1 Children: (0x0)
|
||||
Node: S-1 (0x4), Constraint: S-1 (0x4)
|
||||
1 Children: (0x0)
|
||||
-- Level 0
|
||||
Node: (0x0), Constraint: (0x0)
|
||||
0 Children:
|
||||
ch_lm_train100.noamp.decomposed.txt.gz: line 22292:
|
||||
LM(0) 22292 sentences, 168590 words, 0 OOVs
|
||||
0 zeroprobs, logprob= 0 ppl= 1 ppl1= 1
|
||||
Mod Kneser-Ney smoothing 0-grams
|
||||
n1 = 8685
|
||||
n2 = 2440
|
||||
n3 = 1097
|
||||
n4 = 627
|
||||
D1 = 0.640251
|
||||
D2 = 1.13645
|
||||
D3+ = 1.53624
|
||||
Mod Kneser-Ney smoothing 0x2-grams
|
||||
n1 = 43409
|
||||
n2 = 5684
|
||||
n3 = 1601
|
||||
n4 = 766
|
||||
D1 = 0.792468
|
||||
D2 = 1.33036
|
||||
D3+ = 1.48337
|
||||
Mod Kneser-Ney smoothing 0x4-grams
|
||||
n1 = 58628
|
||||
n2 = 5435
|
||||
n3 = 1239
|
||||
n4 = 516
|
||||
D1 = 0.843593
|
||||
D2 = 1.42307
|
||||
D3+ = 1.59469
|
||||
Mod Kneser-Ney smoothing 0x6-grams
|
||||
n1 = 74228
|
||||
n2 = 2019
|
||||
n3 = 367
|
||||
n4 = 152
|
||||
D1 = 0.948407
|
||||
D2 = 1.48282
|
||||
D3+ = 1.4288
|
||||
Mod Kneser-Ney smoothing 0x7-grams
|
||||
n1 = 62432
|
||||
n2 = 7715
|
||||
n3 = 2519
|
||||
n4 = 1272
|
||||
D1 = 0.801829
|
||||
D2 = 1.21459
|
||||
D3+ = 1.38043
|
||||
warning: distributing 0.18495 left-over probability mass over 1 zeroton words
|
||||
discarded 1 0x2-gram probs predicting pseudo-events
|
||||
discarded 49093 0x2-gram probs discounted to zero
|
||||
discarded 1 0x4-gram probs predicting pseudo-events
|
||||
discarded 1 0x6-gram probs predicting pseudo-events
|
||||
discarded 77157 0x6-gram probs discounted to zero
|
||||
Finished estimation of multi-child graph-backoff node: LM 0 Node 0x6
|
||||
discarded 1 0x7-gram probs predicting pseudo-events
|
||||
discarded 62432 0x7-gram probs discounted to zero
|
||||
writing FLM to dev.lm.gz
|
||||
writing 15022 0x0-grams
|
||||
writing 0 0x1-grams
|
||||
writing 4225 0x2-grams
|
||||
writing 66753 0x4-grams
|
||||
writing 0 0x3-grams
|
||||
writing 0 0x5-grams
|
||||
writing 0 0x6-grams
|
||||
writing 14979 0x7-grams
|
||||
@@ -0,0 +1,4 @@
|
||||
Perplexity on file: ch_lm_dev.noamp.decomposed.txt.gz
|
||||
Language model: 0
|
||||
6003 sentences, 32148 words, 0 OOVs
|
||||
0 zeroprobs, logprob= -84967 ppl= 168.703 ppl1= 439.536
|
||||
@@ -0,0 +1,61 @@
|
||||
Language Model 0 --------------
|
||||
-- Level 3
|
||||
Node: W-1,M-1,S-1 (0x7), Constraint: W-1 (0x1)
|
||||
1 Children: M-1,S-1 (0x6)
|
||||
-- Level 2
|
||||
Node: M-1,S-1 (0x6), Constraint: S-1 (0x4)
|
||||
1 Children: M-1 (0x2)
|
||||
-- Level 1
|
||||
Node: M-1 (0x2), Constraint: M-1 (0x2)
|
||||
1 Children: (0x0)
|
||||
-- Level 0
|
||||
Node: (0x0), Constraint: (0x0)
|
||||
0 Children:
|
||||
../fngram-count/ch_lm_train100.noamp.decomposed.txt.gz: line 22292:
|
||||
LM(0) 22292 sentences, 168590 words, 0 OOVs
|
||||
0 zeroprobs, logprob= 0 ppl= 1 ppl1= 1
|
||||
Mod Kneser-Ney smoothing 0-grams
|
||||
n1 = 8772
|
||||
n2 = 2477
|
||||
n3 = 1101
|
||||
n4 = 627
|
||||
D1 = 0.639079
|
||||
D2 = 1.14781
|
||||
D3+ = 1.54422
|
||||
Mod Kneser-Ney smoothing 0x2-grams
|
||||
n1 = 43482
|
||||
n2 = 5646
|
||||
n3 = 1585
|
||||
n4 = 755
|
||||
D1 = 0.793844
|
||||
D2 = 1.33143
|
||||
D3+ = 1.48744
|
||||
Mod Kneser-Ney smoothing 0x6-grams
|
||||
n1 = 74228
|
||||
n2 = 2019
|
||||
n3 = 367
|
||||
n4 = 152
|
||||
D1 = 0.948407
|
||||
D2 = 1.48282
|
||||
D3+ = 1.4288
|
||||
Mod Kneser-Ney smoothing 0x7-grams
|
||||
n1 = 62432
|
||||
n2 = 7715
|
||||
n3 = 2519
|
||||
n4 = 1272
|
||||
D1 = 0.801829
|
||||
D2 = 1.21459
|
||||
D3+ = 1.38043
|
||||
warning: distributing 0.267684 left-over probability mass over 1 zeroton words
|
||||
discarded 43482 0x2-gram probs discounted to zero
|
||||
discarded 74228 0x6-gram probs discounted to zero
|
||||
discarded 62432 0x7-gram probs discounted to zero
|
||||
writing FLM to dev.lm.gz
|
||||
writing 15022 0x0-grams
|
||||
writing 0 0x1-grams
|
||||
writing 9836 0x2-grams
|
||||
writing 0 0x4-grams
|
||||
writing 0 0x3-grams
|
||||
writing 0 0x5-grams
|
||||
writing 2929 0x6-grams
|
||||
writing 14979 0x7-grams
|
||||
@@ -0,0 +1,2 @@
|
||||
file ../fngram-count/ch_lm_dev.noamp.decomposed.txt.gz: 6003 sentences, 32148 words, 0 OOVs
|
||||
0 zeroprobs, logprob= -86223.3 ppl= 181.992 ppl1= 480.92
|
||||
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
gunzip -c -f ../fngram-count/ch_lm_train100.noamp.decomposed.txt | \
|
||||
sed 's,:, ,g' | \
|
||||
ngram-count -text - -write1 - -sort | \
|
||||
${GAWK-gawk} '$2 > 1 { print $1 }' > train-gt1.vocab
|
||||
28
language_model/srilm-1.7.3/flm/test/tests/fngram-count-vocab/run-test
Executable file
28
language_model/srilm-1.7.3/flm/test/tests/fngram-count-vocab/run-test
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/bin/sh
|
||||
|
||||
if [ -f ../fngram-count/ch_lm_train100.noamp.decomposed.txt.gz ]; then
|
||||
gz=.gz
|
||||
else
|
||||
gz=
|
||||
sed 's,\.gz,,g' ../fngram-count/test.flm > ../fngram-count/test.flm.nogz && \
|
||||
mv ../fngram-count/test.flm ../fngram-count/test.flm-gz && \
|
||||
mv ../fngram-count/test.flm.nogz ../fngram-count/test.flm
|
||||
fi
|
||||
|
||||
./go.make-vocab
|
||||
|
||||
fngram-count \
|
||||
-debug 2 -factor-file ../fngram-count/test.flm \
|
||||
-vocab train-gt1.vocab \
|
||||
-text ../fngram-count/ch_lm_train100.noamp.decomposed.txt$gz \
|
||||
-write-counts -lm -unk -nonull
|
||||
|
||||
fngram \
|
||||
-debug 0 -factor-file ../fngram-count/test.flm \
|
||||
-ppl ../fngram-count/ch_lm_dev.noamp.decomposed.txt$gz -unk -nonull
|
||||
|
||||
rm -f dev.count$gz dev.lm$gz train-gt1.vocab
|
||||
|
||||
if [ -f ../fngram-count/test.flm-gz ]; then
|
||||
mv ../fngram-count/test.flm-gz ../fngram-count/test.flm
|
||||
fi
|
||||
Binary file not shown.
Binary file not shown.
21
language_model/srilm-1.7.3/flm/test/tests/fngram-count/run-test
Executable file
21
language_model/srilm-1.7.3/flm/test/tests/fngram-count/run-test
Executable file
@@ -0,0 +1,21 @@
|
||||
#!/bin/sh
|
||||
|
||||
if [ -f ch_lm_train100.noamp.decomposed.txt.gz ]; then
|
||||
gz=.gz
|
||||
else
|
||||
gz=
|
||||
sed 's,\.gz,,g' test.flm > test.flm.nogz && \
|
||||
mv test.flm test.flm-gz && \
|
||||
mv test.flm.nogz test.flm
|
||||
fi
|
||||
|
||||
fngram-count -debug 2 -factor-file test.flm -text ch_lm_train100.noamp.decomposed.txt$gz -write-counts -lm -unk -nonull
|
||||
|
||||
fngram -debug 0 -factor-file test.flm -ppl ch_lm_dev.noamp.decomposed.txt$gz -unk -nonull
|
||||
|
||||
rm -f dev.count$gz dev.lm$gz
|
||||
|
||||
if [ -f test.flm-gz ]; then
|
||||
mv test.flm-gz test.flm
|
||||
fi
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
##
|
||||
##
|
||||
##
|
||||
## Factored Language Model File:
|
||||
##
|
||||
## Current set of Node Options
|
||||
##
|
||||
## gtmin [num]
|
||||
## gtmax [num]
|
||||
## gt [fileName string]
|
||||
## cdiscount [double]
|
||||
## ndiscount []
|
||||
## wbdiscount []
|
||||
## kndiscount []
|
||||
## ukndiscount []
|
||||
## kn-counts-modified []
|
||||
## kn-counts-modify-at-end []
|
||||
## kn [fileName string]
|
||||
## kn-count-parent [parent spec]
|
||||
## interpolate []
|
||||
## write [fileName string]
|
||||
##
|
||||
## ## The next set of options are active only when there
|
||||
## ## are multiple backoff paths (backoff-graph children) possible.
|
||||
##
|
||||
## strategy [option]
|
||||
## where [option] is one of:
|
||||
## counts_no_norm
|
||||
## counts_sum_counts_norm <default>
|
||||
## counts_sum_num_words_norm
|
||||
## counts_prod_card_norm
|
||||
## counts_sum_card_norm
|
||||
## counts_sum_log_card_norm
|
||||
## bog_node_prob
|
||||
## combine [option]
|
||||
## where [option] is one of:
|
||||
## max <default>
|
||||
## min
|
||||
## sum
|
||||
## avg||mean
|
||||
## prod
|
||||
## gmean
|
||||
## wmean { <node_spec weight> <node_spec weight> ... }
|
||||
##
|
||||
##
|
||||
##
|
||||
## Factors that are currently available in the files in this directory:
|
||||
##
|
||||
## W - word (about 14k)
|
||||
## M - morphological class (about 1279)
|
||||
## S - stem (about 5281)
|
||||
## R - root (about 3346)
|
||||
## P - pattern (about 1516)
|
||||
##
|
||||
|
||||
1
|
||||
|
||||
## bigram w. general backoff that gets better than trigram.
|
||||
## logprob= -84967 ppl= 168.703 ppl1= 439.536
|
||||
W : 3 W(-1) M(-1) S(-1) dev.count.gz dev.lm.gz 5
|
||||
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
|
||||
M1,S1 S1,M1 kndiscount gtmin 100000000 combine mean
|
||||
M1 M1 kndiscount gtmin 3 kn-count-parent W1,M1,S1
|
||||
S1 S1 kndiscount gtmin 1 kn-count-parent W1,M1,S1
|
||||
0 0 kndiscount gtmin 1 kn-count-parent W1,M1,S1
|
||||
|
||||
|
||||
## logprob= -84709 ppl= 166.097 ppl1= 431.488
|
||||
W : 4 W(-1) W(-2) M(-1) S(-1) w_g_w1w2m1s1.count.gz w_g_w1w2m1s1.lm.gz 6
|
||||
W1,W2,M1,S1 W2 kndiscount gtmin 3 interpolate
|
||||
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
|
||||
M1,S1 S1,M1 kndiscount gtmin 100000000 combine max strategy bog_node_prob
|
||||
M1 M1 kndiscount gtmin 3 kn-count-parent W1,M1,S1
|
||||
S1 S1 kndiscount gtmin 1 kn-count-parent W1,M1,S1
|
||||
0 0 kndiscount gtmin 1 kn-count-parent W1,M1,S1
|
||||
|
||||
## 0 zeroprobs, logprob= -86272.5 ppl= 182.533 ppl1= 482.617
|
||||
W : 4 W(-1) W(-2) M(-1) S(-1) w_g_w1w2m1s1.count.gz w_g_w1w2m1s1.lm.gz 5
|
||||
W1,W2,M1,S1 W2 kndiscount gtmin 3 interpolate
|
||||
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
|
||||
M1,S1 M1 kndiscount gtmin 1 interpolate
|
||||
S1 S1 kndiscount gtmin 1 interpolate
|
||||
0 0 kndiscount gtmin 1
|
||||
|
||||
23
language_model/srilm-1.7.3/flm/test/tests/ngram-factored/run-test
Executable file
23
language_model/srilm-1.7.3/flm/test/tests/ngram-factored/run-test
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/bin/sh
|
||||
|
||||
flags="-no-virtual-begin-sentence -no-virtual-end-sentence"
|
||||
|
||||
if [ -f ../fngram-count/ch_lm_train100.noamp.decomposed.txt.gz ]; then
|
||||
gz=.gz
|
||||
else
|
||||
gz=
|
||||
sed 's,\.gz,,g' test.flm > test.flm.nogz && \
|
||||
mv test.flm test.flm-gz && \
|
||||
mv test.flm.nogz test.flm
|
||||
fi
|
||||
|
||||
fngram-count $flags -debug 2 -factor-file test.flm -text ../fngram-count/ch_lm_train100.noamp.decomposed.txt$gz -write-counts -lm -unk -nonull
|
||||
|
||||
ngram -debug 0 -order 2 -factored -lm test.flm -ppl ../fngram-count/ch_lm_dev.noamp.decomposed.txt$gz -unk -nonull
|
||||
|
||||
rm -f dev.count$gz dev.lm$gz
|
||||
|
||||
if [ -f test.flm-gz ]; then
|
||||
mv test.flm-gz test.flm
|
||||
fi
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
##
|
||||
##
|
||||
##
|
||||
## Factored Language Model File:
|
||||
##
|
||||
## Current set of Node Options
|
||||
##
|
||||
## gtmin [num]
|
||||
## gtmax [num]
|
||||
## gt [fileName string]
|
||||
## cdiscount [double]
|
||||
## ndiscount []
|
||||
## wbdiscount []
|
||||
## kndiscount []
|
||||
## ukndiscount []
|
||||
## kn-counts-modified []
|
||||
## kn-counts-modify-at-end []
|
||||
## kn [fileName string]
|
||||
## kn-count-parent [parent spec]
|
||||
## interpolate []
|
||||
## write [fileName string]
|
||||
##
|
||||
## ## The next set of options are active only when there
|
||||
## ## are multiple backoff paths (backoff-graph children) possible.
|
||||
##
|
||||
## strategy [option]
|
||||
## where [option] is one of:
|
||||
## counts_no_norm
|
||||
## counts_sum_counts_norm <default>
|
||||
## counts_sum_num_words_norm
|
||||
## counts_prod_card_norm
|
||||
## counts_sum_card_norm
|
||||
## counts_sum_log_card_norm
|
||||
## bog_node_prob
|
||||
## combine [option]
|
||||
## where [option] is one of:
|
||||
## max <default>
|
||||
## min
|
||||
## sum
|
||||
## avg||mean
|
||||
## prod
|
||||
## gmean
|
||||
## wmean { <node_spec weight> <node_spec weight> ... }
|
||||
##
|
||||
##
|
||||
##
|
||||
## Factors that are currently available in the files in this directory:
|
||||
##
|
||||
## W - word (about 14k)
|
||||
## M - morphological class (about 1279)
|
||||
## S - stem (about 5281)
|
||||
## R - root (about 3346)
|
||||
## P - pattern (about 1516)
|
||||
##
|
||||
|
||||
1
|
||||
|
||||
## bigram w. general backoff that gets better than trigram.
|
||||
## logprob= -84967 ppl= 168.703 ppl1= 439.536
|
||||
W : 3 W(-1) M(-1) S(-1) dev.count.gz dev.lm.gz 5
|
||||
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
|
||||
M1,S1 S1 kndiscount gtmin 2 interpolate
|
||||
M1 M1 kndiscount gtmin 2 interpolate
|
||||
S1 S1 kndiscount gtmin 1 interpolate
|
||||
0 0 kndiscount gtmin 1
|
||||
|
||||
## bigram w. general backoff that gets better than trigram.
|
||||
## logprob= -84967 ppl= 168.703 ppl1= 439.536
|
||||
W : 3 W(-1) M(-1) S(-1) dev.count.gz dev.lm.gz 5
|
||||
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
|
||||
M1,S1 S1,M1 kndiscount gtmin 100000000 combine mean
|
||||
M1 M1 kndiscount gtmin 3 kn-count-parent W1,M1,S1
|
||||
S1 S1 kndiscount gtmin 1 kn-count-parent W1,M1,S1
|
||||
0 0 kndiscount gtmin 1 kn-count-parent W1,M1,S1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user