competition update

nckcard
2025-07-02 12:18:09 -07:00
parent 9e17716a4a
commit 77dbcf868f
2615 changed files with 1648116 additions and 125 deletions

View File

@@ -0,0 +1,48 @@
#!/bin/bash
# Build an n-gram LM with SRILM from a text corpus, restricted to the
# vocabulary of a pronunciation dictionary, and optionally prune it.
lm_src=$1          # LM training text, one sentence per line
lm_tgt_dir=$2      # output directory
dict_type=$3       # 'phn' (phoneme dict; stress markers are removed) or 'char'
lm_order=$4        # n-gram order
prune_threshold=$5 # SRILM pruning threshold; "0" disables pruning
dict=$6            # pronunciation dictionary

[ ! -f "$dict" ] && echo "No such file $dict" && exit 1;

# Check that the SRILM tools are available
if ! which ngram-count > /dev/null; then
    echo "SRILM tools not found. Please download and install them from:"
    echo "http://www.speech.sri.com/projects/srilm/download.html"
    echo "and then add the tools to your PATH."
    exit 1
fi

mkdir -p "$lm_tgt_dir" || exit 1;
echo "$lm_tgt_dir"

if [ "$dict_type" == 'phn' ]; then
    # Remove stress markers from the phoneme dictionary
    python local/remove_stress_marker.py \
        "$dict" "$lm_tgt_dir/dict"
    dict=$lm_tgt_dir/dict
elif [ "$dict_type" == 'char' ]; then
    cp "$dict" "$lm_tgt_dir/dict"
else
    echo "Unknown dict_type '$dict_type' (expected 'phn' or 'char')" && exit 1;
fi

# Unique words: the first field of each dictionary entry. sort -u is used
# instead of plain uniq so duplicates are removed even when the dictionary
# is not sorted.
awk '{print $1}' "$dict" | sort -u > "$lm_tgt_dir/lexicons.txt"

# Train an order-$lm_order LM with Witten-Bell discounting and interpolation,
# restricted to the dictionary vocabulary, mapping everything else to <unk>
ngram-count -debug 1 -order "$lm_order" -wbdiscount -interpolate \
    -unk -map-unk "<unk>" -limit-vocab -vocab "$lm_tgt_dir/lexicons.txt" \
    -text "$lm_src" -lm "$lm_tgt_dir/lm_orig.arpa"

# Prune the LM (skipped when the threshold is 0)
if [ "$prune_threshold" == "0" ]; then
    ln -sf lm_orig.arpa "$lm_tgt_dir/lm.arpa"
else
    ngram -prune "$prune_threshold" -order "$lm_order" \
        -lm "$lm_tgt_dir/lm_orig.arpa" -write-lm "$lm_tgt_dir/lm_pruned.arpa"
    ln -sf lm_pruned.arpa "$lm_tgt_dir/lm.arpa"
fi
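
# Example invocation (a minimal sketch; the script and data file names below
# are illustrative assumptions, since the diff does not show the real paths):
#   bash build_lm.sh data/lm_corpus.txt exp/lm phn 3 1e-7 data/cmudict.dict
# This would train a 3-gram Witten-Bell LM over the corpus, limited to the
# dictionary vocabulary, prune it at threshold 1e-7, and leave the result at
# exp/lm/lm.arpa.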

View File

@@ -0,0 +1,79 @@
import argparse
import re

parser = argparse.ArgumentParser(description='Format LM data')
parser.add_argument('--input_text', type=str, required=True)
parser.add_argument('--output_text', type=str, required=True)
parser.add_argument('--dict', type=str, required=True)
parser.add_argument('--with_punctuation', action='store_true')
parser.add_argument('--with_space_symbol', action='store_true')
parser.add_argument('--unk', action='store_true')
args = parser.parse_args()

# Read the dictionary; only the first field (the word) of each entry is used
lexicons = set()
with open(args.dict, 'r') as f:
    for line in f:
        tokens = line.strip().split(' ')
        lexicons.add(tokens[0].lower())

# Preprocess the text and write it out, one uppercased sentence per line
count = 0
with open(args.input_text, 'r') as in_f, open(args.output_text, 'w') as out_f:
    for line in in_f:
        count += 1
        if count % 10000 == 0:
            print(f'{count} lines processed')
        text = line.strip().replace('-', ' ')
        # Lowercase and keep only letters, spaces and the punctuation . ' , ?
        text = re.sub("[^a-z .',?]", '', text.lower())
        text = re.sub(' +', ' ', text)
        text = text.replace(' .', '.')
        text = text.replace(' ,', ',')
        text = text.replace(', ', ',')
        text = text.replace('..', '.')
        text = text.strip()
        # Split into sentences at terminal punctuation
        text = text.replace('.', '.\n')
        text = text.replace('?', '?\n')
        for seg in text.split('\n'):
            if len(seg) <= 4:
                continue
            sentence = seg.strip()
            if args.with_space_symbol:
                # Mark word boundaries with an explicit '>' symbol
                sentence = sentence.replace(' ', ' > ')
            if args.with_punctuation:
                # Split punctuation marks off into their own tokens
                sentence = sentence.replace('.', ' .')
                sentence = sentence.replace(',', ' , ')
                sentence = sentence.replace('?', ' ?')
            else:
                sentence = sentence.replace('.', '')
                sentence = sentence.replace(',', '')
                sentence = sentence.replace('?', '')
            # Skip sentences containing out-of-vocabulary tokens. Note that
            # with --with_space_symbol or --with_punctuation, the dictionary
            # must also contain the '>' and punctuation tokens, or every
            # sentence will be filtered out here.
            if not args.unk:
                if any(w not in lexicons for w in sentence.split(' ')):
                    continue
            out_f.write(sentence.upper() + '\n')
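
# Example invocation (a minimal sketch; the script name is an illustrative
# assumption, since the diff does not show the real file name):
#   python format_lm_data.py --input_text raw_corpus.txt \
#       --output_text lm_corpus.txt --dict cmudict.dict --with_punctuation
# Each output line is one uppercased sentence whose tokens all appear in the
# dictionary; passing --unk disables that out-of-vocabulary filter.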

View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Copyright 2014 Vassil Panayotov
# Apache 2.0
# Prepares the dictionary: deduplicates the vocabulary, extracts the set of
# lexicon units, and maps the lexicon entries to unit indices
lm_dir=$1
dir=$2
vocab=$lm_dir/dict
[ ! -f "$vocab" ] && echo "$0: vocabulary file not found at $vocab" && exit 1;
mkdir -p "$dir" || exit 1;
echo "$dir"
# Keep only the first pronunciation listed for each word
perl -e 'while(<>){@A = split; if(! $seen{$A[0]}) {$seen{$A[0]} = 1; $s = join(" ",@A); print $s; print "\n"}}' \
    < "$vocab" > "$dir/lexicon_raw_nosil.txt" || exit 1;
echo "lexicon_raw_nosil done"
# The set of units (phones or characters) appearing in any pronunciation
cut -d' ' -f2- "$dir/lexicon_raw_nosil.txt" | tr ' ' '\n' | sort -u > "$dir/units_nosil.txt"
echo "units_nosil.txt done"
sort -u "$dir/lexicon_raw_nosil.txt" > "$dir/lexicon.txt" || exit 1;
# The complete set of lexicon units, indexed by numbers starting from 1
awk '{print $1 " " NR}' "$dir/units_nosil.txt" > "$dir/units.txt"
# Convert the unit sequences in the lexicon into the corresponding sequences of unit indices from units.txt
tools/sym2int.pl -f 2- "$dir/units.txt" < "$dir/lexicon.txt" > "$dir/lexicon_numbers.txt"
echo "lexicon_numbers.txt done"

View File

@@ -0,0 +1,36 @@
import sys

assert len(sys.argv) == 3, f'usage: {sys.argv[0]} <lexicon_in> <lexicon_out>'
lexicon_in = sys.argv[1]
lexicon_out = sys.argv[2]

# ARPAbet vowels, which carry a trailing stress digit (e.g. AA0, AA1, AA2)
# in CMUdict-style lexicons
vowels_with_stress = {
    'AA', 'AE', 'AH', 'AO', 'AW',
    'AY', 'EH', 'ER', 'EY', 'IH', 'IY',
    'OW', 'OY', 'UH', 'UW',
}

with open(lexicon_in, 'r') as in_f, open(lexicon_out, 'w') as out_f:
    for line in in_f:
        line = line.strip()
        if not line:
            continue
        if '\t' in line:
            word, phones = line.split('\t', 1)
            phones = phones.strip().split(' ')
        else:
            tokens = line.split(' ')
            word = tokens[0]
            phones = [t for t in tokens[1:] if len(t) > 0]
        # Drop the stress digit from stressed vowels (e.g. AA1 -> AA)
        new_phones = [p[:-1] if p[:-1] in vowels_with_stress else p
                      for p in phones]
        out_f.write(f'{word}\t{" ".join(new_phones)}\n')
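
# Example invocation (the input/output paths are illustrative assumptions;
# the LM build script above calls this file as local/remove_stress_marker.py):
#   python local/remove_stress_marker.py cmudict.dict dict_no_stress.txt
# For example, the CMUdict entry "ABANDON AH0 B AE1 N D AH0 N" would be
# written as "ABANDON<TAB>AH B AE N D AH N".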