competition update

Author: nckcard
Date: 2025-07-02 12:18:09 -07:00
Parent: 9e17716a4a
Commit: 77dbcf868f
2615 changed files with 1648116 additions and 125 deletions

@@ -0,0 +1,2 @@
include $(SRILM)/common/Makefile.test

@@ -0,0 +1,76 @@
Language Model 0 --------------
-- Level 3
Node: W-1,M-1,S-1 (0x7), Constraint: W-1 (0x1)
1 Children: M-1,S-1 (0x6)
-- Level 2
Node: M-1,S-1 (0x6), Constraint: M-1,S-1 (0x6)
2 Children: S-1 (0x4); M-1 (0x2)
-- Level 1
Node: M-1 (0x2), Constraint: M-1 (0x2)
1 Children: (0x0)
Node: S-1 (0x4), Constraint: S-1 (0x4)
1 Children: (0x0)
-- Level 0
Node: (0x0), Constraint: (0x0)
0 Children:
../fngram-count/ch_lm_train100.noamp.decomposed.txt.gz: line 22292:
LM(0) 22292 sentences, 168590 words, 8102 OOVs
0 zeroprobs, logprob= 0 ppl= 1 ppl1= 1
Mod Kneser-Ney smoothing 0-grams
n1 = 583
n2 = 2441
n3 = 1096
n4 = 627
D1 = 0.106679
D2 = 1.8563
D3+ = 2.75588
Mod Kneser-Ney smoothing 0x2-grams
n1 = 35551
n2 = 5234
n3 = 1641
n4 = 790
D1 = 0.772529
D2 = 1.27337
D3+ = 1.51238
Mod Kneser-Ney smoothing 0x4-grams
n1 = 50515
n2 = 5109
n3 = 1321
n4 = 555
D1 = 0.831755
D2 = 1.35482
D3+ = 1.6022
Mod Kneser-Ney smoothing 0x6-grams
n1 = 68172
n2 = 1480
n3 = 367
n4 = 152
D1 = 0.958387
D2 = 1.28704
D3+ = 1.41226
Mod Kneser-Ney smoothing 0x7-grams
n1 = 54976
n2 = 8146
n3 = 2670
n4 = 1345
D1 = 0.771398
D2 = 1.24148
D3+ = 1.44565
warning: distributing 0.216388 left-over probability mass over all 6919 words
discarded 1 0x2-gram probs predicting pseudo-events
discarded 40785 0x2-gram probs discounted to zero
discarded 1 0x4-gram probs predicting pseudo-events
discarded 1 0x6-gram probs predicting pseudo-events
discarded 70563 0x6-gram probs discounted to zero
Finished estimation of multi-child graph-backoff node: LM 0 Node 0x6
discarded 1 0x7-gram probs predicting pseudo-events
discarded 54976 0x7-gram probs discounted to zero
writing FLM to dev.lm.gz
writing 6920 0x0-grams
writing 0 0x1-grams
writing 4387 0x2-grams
writing 58528 0x4-grams
writing 0 0x3-grams
writing 0 0x5-grams
writing 0 0x6-grams
writing 15858 0x7-grams
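
The hex IDs in the backoff-graph dump at the top of this log are bitmasks over the conditioning parents: 0x1 is W(-1), 0x2 is M(-1), 0x4 is S(-1), so 0x6 is the node that still conditions on M(-1) and S(-1), and 0x0 is the unigram. A throwaway sketch (using gawk's bitwise and(), the same gawk the test scripts assume) makes the mapping explicit:

# Decode the node bitmasks: 0x1 = W(-1), 0x2 = M(-1), 0x4 = S(-1).
${GAWK-gawk} 'BEGIN {
    f[1] = "W(-1)"; f[2] = "M(-1)"; f[3] = "S(-1)"
    for (node = 0; node <= 7; node++) {
        s = ""
        for (b = 1; b <= 3; b++)
            if (and(node, 2 ^ (b - 1)))
                s = s (s == "" ? "" : ",") f[b]
        printf "0x%x = {%s}\n", node, s
    }
}'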

@@ -0,0 +1,4 @@
Perplexity on file: ../fngram-count/ch_lm_dev.noamp.decomposed.txt.gz
Language model: 0
6003 sentences, 32148 words, 0 OOVs
0 zeroprobs, logprob= -80707.8 ppl= 130.462 ppl1= 323.972
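
SRILM derives both perplexity figures from the base-10 logprob: ppl normalizes by words plus sentence-end events, ppl1 by words alone (OOVs and zeroprobs are excluded from both denominators). The numbers above can be reproduced with a one-liner (gawk assumed, as in the test scripts):

${GAWK-gawk} 'BEGIN {
    logprob = -80707.8; sents = 6003; words = 32148; oovs = 0; zeroprobs = 0
    # ppl counts the </s> predictions; ppl1 does not
    printf "ppl  = %.3f\n", 10 ^ (-logprob / (words - oovs - zeroprobs + sents))
    printf "ppl1 = %.3f\n", 10 ^ (-logprob / (words - oovs - zeroprobs))
}'

which prints ppl = 130.462 and ppl1 = 323.972, matching the line above.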

@@ -0,0 +1,76 @@
Language Model 0 --------------
-- Level 3
Node: W-1,M-1,S-1 (0x7), Constraint: W-1 (0x1)
1 Children: M-1,S-1 (0x6)
-- Level 2
Node: M-1,S-1 (0x6), Constraint: M-1,S-1 (0x6)
2 Children: S-1 (0x4); M-1 (0x2)
-- Level 1
Node: M-1 (0x2), Constraint: M-1 (0x2)
1 Children: (0x0)
Node: S-1 (0x4), Constraint: S-1 (0x4)
1 Children: (0x0)
-- Level 0
Node: (0x0), Constraint: (0x0)
0 Children:
ch_lm_train100.noamp.decomposed.txt.gz: line 22292:
LM(0) 22292 sentences, 168590 words, 0 OOVs
0 zeroprobs, logprob= 0 ppl= 1 ppl1= 1
Mod Kneser-Ney smoothing 0-grams
n1 = 8685
n2 = 2440
n3 = 1097
n4 = 627
D1 = 0.640251
D2 = 1.13645
D3+ = 1.53624
Mod Kneser-Ney smoothing 0x2-grams
n1 = 43409
n2 = 5684
n3 = 1601
n4 = 766
D1 = 0.792468
D2 = 1.33036
D3+ = 1.48337
Mod Kneser-Ney smoothing 0x4-grams
n1 = 58628
n2 = 5435
n3 = 1239
n4 = 516
D1 = 0.843593
D2 = 1.42307
D3+ = 1.59469
Mod Kneser-Ney smoothing 0x6-grams
n1 = 74228
n2 = 2019
n3 = 367
n4 = 152
D1 = 0.948407
D2 = 1.48282
D3+ = 1.4288
Mod Kneser-Ney smoothing 0x7-grams
n1 = 62432
n2 = 7715
n3 = 2519
n4 = 1272
D1 = 0.801829
D2 = 1.21459
D3+ = 1.38043
warning: distributing 0.18495 left-over probability mass over 1 zeroton words
discarded 1 0x2-gram probs predicting pseudo-events
discarded 49093 0x2-gram probs discounted to zero
discarded 1 0x4-gram probs predicting pseudo-events
discarded 1 0x6-gram probs predicting pseudo-events
discarded 77157 0x6-gram probs discounted to zero
Finished estimation of multi-child graph-backoff node: LM 0 Node 0x6
discarded 1 0x7-gram probs predicting pseudo-events
discarded 62432 0x7-gram probs discounted to zero
writing FLM to dev.lm.gz
writing 15022 0x0-grams
writing 0 0x1-grams
writing 4225 0x2-grams
writing 66753 0x4-grams
writing 0 0x3-grams
writing 0 0x5-grams
writing 0 0x6-grams
writing 14979 0x7-grams

@@ -0,0 +1,4 @@
Perplexity on file: ch_lm_dev.noamp.decomposed.txt.gz
Language model: 0
6003 sentences, 32148 words, 0 OOVs
0 zeroprobs, logprob= -84967 ppl= 168.703 ppl1= 439.536

@@ -0,0 +1,61 @@
Language Model 0 --------------
-- Level 3
Node: W-1,M-1,S-1 (0x7), Constraint: W-1 (0x1)
1 Children: M-1,S-1 (0x6)
-- Level 2
Node: M-1,S-1 (0x6), Constraint: S-1 (0x4)
1 Children: M-1 (0x2)
-- Level 1
Node: M-1 (0x2), Constraint: M-1 (0x2)
1 Children: (0x0)
-- Level 0
Node: (0x0), Constraint: (0x0)
0 Children:
../fngram-count/ch_lm_train100.noamp.decomposed.txt.gz: line 22292:
LM(0) 22292 sentences, 168590 words, 0 OOVs
0 zeroprobs, logprob= 0 ppl= 1 ppl1= 1
Mod Kneser-Ney smoothing 0-grams
n1 = 8772
n2 = 2477
n3 = 1101
n4 = 627
D1 = 0.639079
D2 = 1.14781
D3+ = 1.54422
Mod Kneser-Ney smoothing 0x2-grams
n1 = 43482
n2 = 5646
n3 = 1585
n4 = 755
D1 = 0.793844
D2 = 1.33143
D3+ = 1.48744
Mod Kneser-Ney smoothing 0x6-grams
n1 = 74228
n2 = 2019
n3 = 367
n4 = 152
D1 = 0.948407
D2 = 1.48282
D3+ = 1.4288
Mod Kneser-Ney smoothing 0x7-grams
n1 = 62432
n2 = 7715
n3 = 2519
n4 = 1272
D1 = 0.801829
D2 = 1.21459
D3+ = 1.38043
warning: distributing 0.267684 left-over probability mass over 1 zeroton words
discarded 43482 0x2-gram probs discounted to zero
discarded 74228 0x6-gram probs discounted to zero
discarded 62432 0x7-gram probs discounted to zero
writing FLM to dev.lm.gz
writing 15022 0x0-grams
writing 0 0x1-grams
writing 9836 0x2-grams
writing 0 0x4-grams
writing 0 0x3-grams
writing 0 0x5-grams
writing 2929 0x6-grams
writing 14979 0x7-grams

@@ -0,0 +1,2 @@
file ../fngram-count/ch_lm_dev.noamp.decomposed.txt.gz: 6003 sentences, 32148 words, 0 OOVs
0 zeroprobs, logprob= -86223.3 ppl= 181.992 ppl1= 480.92

@@ -0,0 +1,6 @@
#!/bin/sh
# Build a count-pruned vocabulary: stream the training text (gunzip -c -f
# passes uncompressed input through, so this works with or without .gz),
# split factored tokens on ":", count unigrams with ngram-count, and keep
# only words seen more than once.
gunzip -c -f ../fngram-count/ch_lm_train100.noamp.decomposed.txt | \
sed 's,:, ,g' | \
ngram-count -text - -write1 - -sort | \
${GAWK-gawk} '$2 > 1 { print $1 }' > train-gt1.vocab

@@ -0,0 +1,28 @@
#!/bin/sh
# If the gzipped training data is present, use .gz suffixes throughout;
# otherwise strip the .gz suffixes from test.flm (restored at the end).
if [ -f ../fngram-count/ch_lm_train100.noamp.decomposed.txt.gz ]; then
	gz=.gz
else
	gz=
	sed 's,\.gz,,g' ../fngram-count/test.flm > ../fngram-count/test.flm.nogz && \
	mv ../fngram-count/test.flm ../fngram-count/test.flm-gz && \
	mv ../fngram-count/test.flm.nogz ../fngram-count/test.flm
fi
# Build the pruned vocabulary, train the factored LM restricted to it,
# then score the dev set.
./go.make-vocab
fngram-count \
	-debug 2 -factor-file ../fngram-count/test.flm \
	-vocab train-gt1.vocab \
	-text ../fngram-count/ch_lm_train100.noamp.decomposed.txt$gz \
	-write-counts -lm -unk -nonull
fngram \
	-debug 0 -factor-file ../fngram-count/test.flm \
	-ppl ../fngram-count/ch_lm_dev.noamp.decomposed.txt$gz -unk -nonull
# Clean up intermediate files and restore test.flm if it was rewritten.
rm -f dev.count$gz dev.lm$gz train-gt1.vocab
if [ -f ../fngram-count/test.flm-gz ]; then
	mv ../fngram-count/test.flm-gz ../fngram-count/test.flm
fi

@@ -0,0 +1,21 @@
#!/bin/sh
# As in the other test scripts: fall back to uncompressed data by stripping
# the .gz suffixes from test.flm (restored at the end).
if [ -f ch_lm_train100.noamp.decomposed.txt.gz ]; then
	gz=.gz
else
	gz=
	sed 's,\.gz,,g' test.flm > test.flm.nogz && \
	mv test.flm test.flm-gz && \
	mv test.flm.nogz test.flm
fi
# Train the factored LM, then report dev-set perplexity.
fngram-count -debug 2 -factor-file test.flm -text ch_lm_train100.noamp.decomposed.txt$gz -write-counts -lm -unk -nonull
fngram -debug 0 -factor-file test.flm -ppl ch_lm_dev.noamp.decomposed.txt$gz -unk -nonull
rm -f dev.count$gz dev.lm$gz
if [ -f test.flm-gz ]; then
	mv test.flm-gz test.flm
fi

@@ -0,0 +1,84 @@
##
##
##
## Factored Language Model File:
##
## Current set of Node Options
##
## gtmin [num]
## gtmax [num]
## gt [fileName string]
## cdiscount [double]
## ndiscount []
## wbdiscount []
## kndiscount []
## ukndiscount []
## kn-counts-modified []
## kn-counts-modify-at-end []
## kn [fileName string]
## kn-count-parent [parent spec]
## interpolate []
## write [fileName string]
##
## ## The next set of options is active only when multiple
## ## backoff paths (backoff-graph children) are possible.
##
## strategy [option]
## where [option] is one of:
## counts_no_norm
## counts_sum_counts_norm <default>
## counts_sum_num_words_norm
## counts_prod_card_norm
## counts_sum_card_norm
## counts_sum_log_card_norm
## bog_node_prob
## combine [option]
## where [option] is one of:
## max <default>
## min
## sum
## avg||mean
## prod
## gmean
## wmean { <node_spec weight> <node_spec weight> ... }
##
##
##
## Factors that are currently available in the files in this directory:
##
## W - word (about 14k)
## M - morphological class (about 1279)
## S - stem (about 5281)
## R - root (about 3346)
## P - pattern (about 1516)
##
1
## bigram w. general backoff that does better than the trigram.
## logprob= -84967 ppl= 168.703 ppl1= 439.536
W : 3 W(-1) M(-1) S(-1) dev.count.gz dev.lm.gz 5
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
M1,S1 S1,M1 kndiscount gtmin 100000000 combine mean
M1 M1 kndiscount gtmin 3 kn-count-parent W1,M1,S1
S1 S1 kndiscount gtmin 1 kn-count-parent W1,M1,S1
0 0 kndiscount gtmin 1 kn-count-parent W1,M1,S1
## logprob= -84709 ppl= 166.097 ppl1= 431.488
W : 4 W(-1) W(-2) M(-1) S(-1) w_g_w1w2m1s1.count.gz w_g_w1w2m1s1.lm.gz 6
W1,W2,M1,S1 W2 kndiscount gtmin 3 interpolate
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
M1,S1 S1,M1 kndiscount gtmin 100000000 combine max strategy bog_node_prob
M1 M1 kndiscount gtmin 3 kn-count-parent W1,M1,S1
S1 S1 kndiscount gtmin 1 kn-count-parent W1,M1,S1
0 0 kndiscount gtmin 1 kn-count-parent W1,M1,S1
## 0 zeroprobs, logprob= -86272.5 ppl= 182.533 ppl1= 482.617
W : 4 W(-1) W(-2) M(-1) S(-1) w_g_w1w2m1s1.count.gz w_g_w1w2m1s1.lm.gz 5
W1,W2,M1,S1 W2 kndiscount gtmin 3 interpolate
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
M1,S1 M1 kndiscount gtmin 1 interpolate
S1 S1 kndiscount gtmin 1 interpolate
0 0 kndiscount gtmin 1
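
To decode the uncommented spec lines above: each model starts with a header of the form "child : #parents parent-list count-file lm-file #node-lines", followed by one line per backoff-graph node giving the node's parent set, the factor(s) dropped when backing off from it, and its smoothing options. A minimal annotated sketch (toy.count.gz and toy.lm.gz are hypothetical names, not files in this test):

1                                                ## number of models in this file
## child : #parents parents...  count-file  lm-file  #node-lines
W : 2 W(-1) M(-1) toy.count.gz toy.lm.gz 3
## node    drop  smoothing options
W1,M1      W1    kndiscount gtmin 2 interpolate  ## full bigram context
M1         M1    kndiscount gtmin 1 interpolate  ## back off to class only
0          0     kndiscount gtmin 1              ## unigram

The "gtmin 100000000" on the M1,S1 nodes above is the usual idiom for a pure graph-backoff node: the huge minimum count forces every event at that node to back off immediately, so only its combine/strategy settings have any effect.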

@@ -0,0 +1,23 @@
#!/bin/sh
# Train without the virtual sentence-boundary context tokens.
flags="-no-virtual-begin-sentence -no-virtual-end-sentence"
if [ -f ../fngram-count/ch_lm_train100.noamp.decomposed.txt.gz ]; then
	gz=.gz
else
	gz=
	sed 's,\.gz,,g' test.flm > test.flm.nogz && \
	mv test.flm test.flm-gz && \
	mv test.flm.nogz test.flm
fi
fngram-count $flags -debug 2 -factor-file test.flm -text ../fngram-count/ch_lm_train100.noamp.decomposed.txt$gz -write-counts -lm -unk -nonull
# Score with ngram -factored rather than fngram, exercising the FLM
# support in the main ngram tool.
ngram -debug 0 -order 2 -factored -lm test.flm -ppl ../fngram-count/ch_lm_dev.noamp.decomposed.txt$gz -unk -nonull
rm -f dev.count$gz dev.lm$gz
if [ -f test.flm-gz ]; then
	mv test.flm-gz test.flm
fi

@@ -0,0 +1,76 @@
##
##
##
## Factored Language Model File:
##
## Current set of Node Options
##
## gtmin [num]
## gtmax [num]
## gt [fileName string]
## cdiscount [double]
## ndiscount []
## wbdiscount []
## kndiscount []
## ukndiscount []
## kn-counts-modified []
## kn-counts-modify-at-end []
## kn [fileName string]
## kn-count-parent [parent spec]
## interpolate []
## write [fileName string]
##
## ## The next set of options is active only when multiple
## ## backoff paths (backoff-graph children) are possible.
##
## strategy [option]
## where [option] is one of:
## counts_no_norm
## counts_sum_counts_norm <default>
## counts_sum_num_words_norm
## counts_prod_card_norm
## counts_sum_card_norm
## counts_sum_log_card_norm
## bog_node_prob
## combine [option]
## where [option] is one of:
## max <default>
## min
## sum
## avg||mean
## prod
## gmean
## wmean { <node_spec weight> <node_spec weight> ... }
##
##
##
## Factors that are currently available in the files in this directory:
##
## W - word (about 14k)
## M - morphological class (about 1279)
## S - stem (about 5281)
## R - root (about 3346)
## P - pattern (about 1516)
##
1
## bigram w. general backoff that does better than the trigram.
## logprob= -84967 ppl= 168.703 ppl1= 439.536
W : 3 W(-1) M(-1) S(-1) dev.count.gz dev.lm.gz 5
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
M1,S1 S1 kndiscount gtmin 2 interpolate
M1 M1 kndiscount gtmin 2 interpolate
S1 S1 kndiscount gtmin 1 interpolate
0 0 kndiscount gtmin 1
## bigram w. general backoff that gets better than trigram.
## logprob= -84967 ppl= 168.703 ppl1= 439.536
W : 3 W(-1) M(-1) S(-1) dev.count.gz dev.lm.gz 5
W1,M1,S1 W1 kndiscount gtmin 2 interpolate
M1,S1 S1,M1 kndiscount gtmin 100000000 combine mean
M1 M1 kndiscount gtmin 3 kn-count-parent W1,M1,S1
S1 S1 kndiscount gtmin 1 kn-count-parent W1,M1,S1
0 0 kndiscount gtmin 1 kn-count-parent W1,M1,S1