competition update
language_model/examples/handwriting/s0/local/build_lm.sh (executable file, 48 lines)
@@ -0,0 +1,48 @@
#!/bin/bash

# Build an n-gram LM with SRILM.
# Usage: build_lm.sh <lm_src> <lm_tgt_dir> <dict_type: phn|char> <lm_order> <prune_threshold> <dict>

lm_src=$1
lm_tgt_dir=$2
dict_type=$3
lm_order=$4
prune_threshold=$5
dict=$6

[ ! -f "$dict" ] && echo "No such file $dict" && exit 1;

# Check that the SRILM tools are available
if ! which ngram-count > /dev/null; then
  echo "SRILM tools not found; please download and install them from:"
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH"
  exit 1
fi

mkdir -p "$lm_tgt_dir" || exit 1;
echo "$lm_tgt_dir"

if [ "$dict_type" == 'phn' ]; then
  # Remove stress markers from the phone dictionary
  python local/remove_stress_marker.py \
    "$dict" "$lm_tgt_dir/dict"
  dict=$lm_tgt_dir/dict
elif [ "$dict_type" == 'char' ]; then
  cp "$dict" "$lm_tgt_dir/dict"
fi

# Unique words (assumes the dictionary is sorted, so uniq suffices)
cat "$dict" | awk '{print $1}' | uniq > "$lm_tgt_dir/lexicons.txt"

# Train the n-gram LM (order $lm_order) with Witten-Bell discounting
ngram-count -debug 1 -order $lm_order -wbdiscount -interpolate \
  -unk -map-unk "<unk>" -limit-vocab -vocab "$lm_tgt_dir/lexicons.txt" \
  -text "$lm_src" -lm "$lm_tgt_dir/lm_orig.arpa"

# Prune the LM unless the threshold is zero
if [ "$prune_threshold" == "0" ]; then
  ln -sf lm_orig.arpa "$lm_tgt_dir/lm.arpa"
else
  ngram -prune $prune_threshold -order $lm_order -lm "$lm_tgt_dir/lm_orig.arpa" \
    -write-lm "$lm_tgt_dir/lm_pruned.arpa"
  ln -sf lm_pruned.arpa "$lm_tgt_dir/lm.arpa"
fi

#rm $lm_tgt_dir/webTextSentences_uppercase.txt
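For reference, a hypothetical invocation of this script (the corpus and dictionary paths below are illustrative, not taken from this commit):

# Build a character-level LM of order 3, pruned at threshold 1e-7:
local/build_lm.sh data/lm_text.txt data/lm char 3 1e-7 data/dict.txt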
@@ -0,0 +1,79 @@
import argparse
import re

# Normalize raw text for LM training: lowercase, strip unwanted characters,
# split into sentences, and optionally drop lines with out-of-vocabulary words.
parser = argparse.ArgumentParser(description='Format LM data')
parser.add_argument('--input_text', type=str, required=True)
parser.add_argument('--output_text', type=str, required=True)
parser.add_argument('--dict', type=str, required=True)
parser.add_argument('--with_punctuation', action='store_true')
parser.add_argument('--with_space_symbol', action='store_true')
parser.add_argument('--unk', action='store_true')
args = parser.parse_args()

# Read the dictionary: the first token of each line is a vocabulary word
lexicons = set()
with open(args.dict, 'r') as f:
    for line in f:
        tokens = line.strip().split(' ')
        lexicons.add(tokens[0].lower())

# Preprocess texts and write to output
output_f = open(args.output_text, 'w')
input_f = open(args.input_text, 'r')
count = 0
while True:
    line = input_f.readline()
    if not line:
        break
    count += 1

    if count % 10000 == 0:
        print(count)

    # Keep only lowercase letters, spaces, and basic punctuation
    modifiedText = line.strip().replace('\n', ' ')
    modifiedText = modifiedText.replace('-', ' ')
    modifiedText = re.sub("[^a-z .',?]", '', modifiedText.lower())
    modifiedText = re.sub(' +', ' ', modifiedText)
    modifiedText = modifiedText.replace(' .', '.')
    modifiedText = modifiedText.replace(' ,', ',')
    modifiedText = modifiedText.replace(', ', ',')
    modifiedText = modifiedText.replace('..', '.')
    modifiedText = modifiedText.strip()

    # Split into sentences at sentence-final punctuation
    modifiedText = modifiedText.replace('.', '.\n')
    modifiedText = modifiedText.replace('. ', '.\n')
    modifiedText = modifiedText.replace('?', '?\n')
    modifiedText = modifiedText.replace('? ', '?\n')

    allNewLines = modifiedText.split('\n')

    for x in range(len(allNewLines)):
        if len(allNewLines[x]) > 4:
            newLine = allNewLines[x].strip()
            if args.with_space_symbol:
                # Mark word boundaries with an explicit '>' symbol
                newLine = newLine.replace(' ', ' > ')

            if args.with_punctuation:
                # Separate punctuation into standalone tokens
                newLine = newLine.replace('.', ' .')
                newLine = newLine.replace(',', ' , ')
                newLine = newLine.replace('?', ' ?')
            else:
                newLine = newLine.replace('.', '')
                newLine = newLine.replace(',', '')
                newLine = newLine.replace('?', '')

            # Unless --unk is given, keep only sentences fully covered by the dictionary
            hasAllWords = True
            if not args.unk:
                words = newLine.split(' ')
                for w in words:
                    if w not in lexicons:
                        hasAllWords = False
                        break

            if hasAllWords:
                output_f.write(newLine.upper() + '\n')

output_f.close()
input_f.close()
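The filename of this script is not shown in the diff; assuming it is saved as local/format_lm_data.py (a hypothetical name), an invocation might look like:

# Produce LM training text with explicit '>' word-boundary symbols,
# keeping only sentences fully covered by the dictionary:
python local/format_lm_data.py --input_text corpus.txt --output_text data/lm_text.txt \
    --dict data/dict.txt --with_space_symbol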
language_model/examples/handwriting/s0/local/prepare_dict_ctc.sh (executable file, 42 lines)
@@ -0,0 +1,42 @@
#!/bin/bash

# Copyright 2014 Vassil Panayotov
# Apache 2.0

# Prepares the dictionary and auto-generates the pronunciations for the words
# that are in our vocabulary but not in CMUdict

lm_dir=$1
dir=$2

#vocab=$lm_dir/librispeech-lexicon_no_stress.txt
vocab=$lm_dir/dict
[ ! -f "$vocab" ] && echo "$0: vocabulary file not found at $vocab" && exit 1;

mkdir -p "$dir" || exit 1;
echo "$dir"

# Keep only the first pronunciation seen for each word
cat "$vocab" | \
  perl -e 'while(<>){@A = split; if(! $seen{$A[0]}) {$seen{$A[0]} = 1; $s = join(" ",@A); print $s; print "\n"}}' \
  > "$dir/lexicon_raw_nosil.txt" || exit 1;
echo "lexicon_raw_nosil done"

# awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $dir/lexicon_raw_nosil.txt |\
#   sort -u |\
#   perl -e 'while(<>){
#     chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
#     $phones_of{$1} .= "$_ "; }
#   foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \
#   > $dir/units_nosil.txt || exit 1;

# Collect the set of units (phones or characters) used in the lexicon
cut -d' ' -f2- "$dir/lexicon_raw_nosil.txt" | tr ' ' '\n' | sort -u > "$dir/units_nosil.txt"
echo "units_nosil.txt done"

cat "$dir/lexicon_raw_nosil.txt" | sort | uniq > "$dir/lexicon.txt" || exit 1;

# The complete set of lexicon units, indexed by numbers starting from 1
cat "$dir/units_nosil.txt" | awk '{print $1 " " NR}' > "$dir/units.txt"

# Convert character sequences into the corresponding sequences of unit indices encoded by units.txt
tools/sym2int.pl -f 2- "$dir/units.txt" < "$dir/lexicon.txt" > "$dir/lexicon_numbers.txt"
echo "lexicon_numbers.txt done"
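A hypothetical invocation (the directory names are illustrative):

# Build the CTC lexicon and unit inventory from the dictionary in data/lm:
local/prepare_dict_ctc.sh data/lm data/dict_ctc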
@@ -0,0 +1,36 @@
import sys

# Strip lexical stress markers (0/1/2) from CMUdict-style pronunciations:
# a phone like 'AH0' becomes 'AH'.
assert len(sys.argv) == 3
lexicon_in = sys.argv[1]
lexicon_out = sys.argv[2]

phones_with_stress = {'AA', 'AE', 'AH', 'AO', 'AW',
                      'AY', 'EH', 'ER', 'EY', 'IH', 'IY',
                      'OW', 'OY', 'UH', 'UW'}

out_f = open(lexicon_out, 'w')
with open(lexicon_in, 'r') as in_f:
    for line in in_f:
        line = line.strip()
        if '\t' in line:
            # Tab-separated entry: word<TAB>phone sequence
            lexicon, phones = line.split('\t')
            phones = phones.strip().split(' ')
        else:
            # Space-separated entry: word followed by its phones
            tokens = line.split(' ')
            lexicon = tokens[0]
            phones = []
            for t in tokens[1:]:
                if len(t) > 0:
                    phones.append(t)
        new_phones = []
        for p in phones:
            # Drop the trailing stress digit if the base phone is a vowel
            if p[:-1] in phones_with_stress:
                new_phones.append(p[:-1])
            else:
                new_phones.append(p)
        #print(new_phones, " ".join(new_phones))
        out_f.write(f'{lexicon}\t{" ".join(new_phones)}\n')
out_f.close()
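build_lm.sh invokes this script as local/remove_stress_marker.py. As a concrete illustration (the lexicon filenames below are hypothetical):

# Given an input line:   ABANDON AH0 B AE1 N D AH0 N
# the output line is:    ABANDON<TAB>AH B AE N D AH N
python local/remove_stress_marker.py lexicon.txt lexicon_no_stress.txt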