competition update

nckcard
2025-07-02 12:18:09 -07:00
parent 9e17716a4a
commit 77dbcf868f
2615 changed files with 1648116 additions and 125 deletions

View File

@@ -0,0 +1,48 @@
#!/bin/bash
# Build an n-gram LM with SRILM from a text corpus, restricted to the
# vocabulary of a pronunciation dictionary, and optionally prune it.
lm_src=$1          # LM training text, one sentence per line
lm_tgt_dir=$2      # output directory
dict_type=$3       # 'phn' (phoneme dict; stress markers are removed) or 'char'
lm_order=$4        # n-gram order
prune_threshold=$5 # SRILM pruning threshold; "0" disables pruning
dict=$6            # pronunciation dictionary

[ ! -f "$dict" ] && echo "No such file $dict" && exit 1;

# Check that the SRILM tools are available
if ! which ngram-count > /dev/null; then
    echo "SRILM tools not found. Please download and install them from:"
    echo "http://www.speech.sri.com/projects/srilm/download.html"
    echo "and then add the tools to your PATH."
    exit 1
fi

mkdir -p "$lm_tgt_dir" || exit 1;
echo "$lm_tgt_dir"

if [ "$dict_type" == 'phn' ]; then
    # Remove stress markers from the phoneme dictionary
    python local/remove_stress_marker.py \
        "$dict" "$lm_tgt_dir/dict"
    dict=$lm_tgt_dir/dict
elif [ "$dict_type" == 'char' ]; then
    cp "$dict" "$lm_tgt_dir/dict"
else
    echo "Unknown dict_type '$dict_type' (expected 'phn' or 'char')" && exit 1;
fi

# Unique words: the first field of each dictionary entry. sort -u is used
# instead of plain uniq so duplicates are removed even when the dictionary
# is not sorted.
awk '{print $1}' "$dict" | sort -u > "$lm_tgt_dir/lexicons.txt"

# Train an order-$lm_order LM with Witten-Bell discounting and interpolation,
# restricted to the dictionary vocabulary, mapping everything else to <unk>
ngram-count -debug 1 -order "$lm_order" -wbdiscount -interpolate \
    -unk -map-unk "<unk>" -limit-vocab -vocab "$lm_tgt_dir/lexicons.txt" \
    -text "$lm_src" -lm "$lm_tgt_dir/lm_orig.arpa"

# Prune the LM (skipped when the threshold is 0)
if [ "$prune_threshold" == "0" ]; then
    ln -sf lm_orig.arpa "$lm_tgt_dir/lm.arpa"
else
    ngram -prune "$prune_threshold" -order "$lm_order" \
        -lm "$lm_tgt_dir/lm_orig.arpa" -write-lm "$lm_tgt_dir/lm_pruned.arpa"
    ln -sf lm_pruned.arpa "$lm_tgt_dir/lm.arpa"
fi
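
# Example invocation (a minimal sketch; the script and data file names below
# are illustrative assumptions, since the diff does not show the real paths):
#   bash build_lm.sh data/lm_corpus.txt exp/lm phn 3 1e-7 data/cmudict.dict
# This would train a 3-gram Witten-Bell LM over the corpus, limited to the
# dictionary vocabulary, prune it at threshold 1e-7, and leave the result at
# exp/lm/lm.arpa.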

View File

@@ -0,0 +1,79 @@
import argparse
import re

parser = argparse.ArgumentParser(description='Format LM data')
parser.add_argument('--input_text', type=str, required=True)
parser.add_argument('--output_text', type=str, required=True)
parser.add_argument('--dict', type=str, required=True)
parser.add_argument('--with_punctuation', action='store_true')
parser.add_argument('--with_space_symbol', action='store_true')
parser.add_argument('--unk', action='store_true')
args = parser.parse_args()

# Read the dictionary; only the first field (the word) of each entry is used
lexicons = set()
with open(args.dict, 'r') as f:
    for line in f:
        tokens = line.strip().split(' ')
        lexicons.add(tokens[0].lower())

# Preprocess the text and write it out, one uppercased sentence per line
count = 0
with open(args.input_text, 'r') as in_f, open(args.output_text, 'w') as out_f:
    for line in in_f:
        count += 1
        if count % 10000 == 0:
            print(f'{count} lines processed')
        text = line.strip().replace('-', ' ')
        # Lowercase and keep only letters, spaces and the punctuation . ' , ?
        text = re.sub("[^a-z .',?]", '', text.lower())
        text = re.sub(' +', ' ', text)
        text = text.replace(' .', '.')
        text = text.replace(' ,', ',')
        text = text.replace(', ', ',')
        text = text.replace('..', '.')
        text = text.strip()
        # Split into sentences at terminal punctuation
        text = text.replace('.', '.\n')
        text = text.replace('?', '?\n')
        for seg in text.split('\n'):
            if len(seg) <= 4:
                continue
            sentence = seg.strip()
            if args.with_space_symbol:
                # Mark word boundaries with an explicit '>' symbol
                sentence = sentence.replace(' ', ' > ')
            if args.with_punctuation:
                # Split punctuation marks off into their own tokens
                sentence = sentence.replace('.', ' .')
                sentence = sentence.replace(',', ' , ')
                sentence = sentence.replace('?', ' ?')
            else:
                sentence = sentence.replace('.', '')
                sentence = sentence.replace(',', '')
                sentence = sentence.replace('?', '')
            # Skip sentences containing out-of-vocabulary tokens. Note that
            # with --with_space_symbol or --with_punctuation, the dictionary
            # must also contain the '>' and punctuation tokens, or every
            # sentence will be filtered out here.
            if not args.unk:
                if any(w not in lexicons for w in sentence.split(' ')):
                    continue
            out_f.write(sentence.upper() + '\n')
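
# Example invocation (a minimal sketch; the script name is an illustrative
# assumption, since the diff does not show the real file name):
#   python format_lm_data.py --input_text raw_corpus.txt \
#       --output_text lm_corpus.txt --dict cmudict.dict --with_punctuation
# Each output line is one uppercased sentence whose tokens all appear in the
# dictionary; passing --unk disables that out-of-vocabulary filter.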

View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Copyright 2014 Vassil Panayotov
# Apache 2.0
# Prepares the dictionary: deduplicates the vocabulary, extracts the set of
# lexicon units, and maps the lexicon entries to unit indices
lm_dir=$1
dir=$2
vocab=$lm_dir/dict
[ ! -f "$vocab" ] && echo "$0: vocabulary file not found at $vocab" && exit 1;
mkdir -p "$dir" || exit 1;
echo "$dir"
# Keep only the first pronunciation listed for each word
perl -e 'while(<>){@A = split; if(! $seen{$A[0]}) {$seen{$A[0]} = 1; $s = join(" ",@A); print $s; print "\n"}}' \
    < "$vocab" > "$dir/lexicon_raw_nosil.txt" || exit 1;
echo "lexicon_raw_nosil done"
# The set of units (phones or characters) appearing in any pronunciation
cut -d' ' -f2- "$dir/lexicon_raw_nosil.txt" | tr ' ' '\n' | sort -u > "$dir/units_nosil.txt"
echo "units_nosil.txt done"
sort -u "$dir/lexicon_raw_nosil.txt" > "$dir/lexicon.txt" || exit 1;
# The complete set of lexicon units, indexed by numbers starting from 1
awk '{print $1 " " NR}' "$dir/units_nosil.txt" > "$dir/units.txt"
# Convert the unit sequences in the lexicon into the corresponding sequences of unit indices from units.txt
tools/sym2int.pl -f 2- "$dir/units.txt" < "$dir/lexicon.txt" > "$dir/lexicon_numbers.txt"
echo "lexicon_numbers.txt done"

View File

@@ -0,0 +1,36 @@
import sys

assert len(sys.argv) == 3, f'usage: {sys.argv[0]} <lexicon_in> <lexicon_out>'
lexicon_in = sys.argv[1]
lexicon_out = sys.argv[2]

# ARPAbet vowels, which carry a trailing stress digit (e.g. AA0, AA1, AA2)
# in CMUdict-style lexicons
vowels_with_stress = {
    'AA', 'AE', 'AH', 'AO', 'AW',
    'AY', 'EH', 'ER', 'EY', 'IH', 'IY',
    'OW', 'OY', 'UH', 'UW',
}

with open(lexicon_in, 'r') as in_f, open(lexicon_out, 'w') as out_f:
    for line in in_f:
        line = line.strip()
        if not line:
            continue
        if '\t' in line:
            word, phones = line.split('\t', 1)
            phones = phones.strip().split(' ')
        else:
            tokens = line.split(' ')
            word = tokens[0]
            phones = [t for t in tokens[1:] if len(t) > 0]
        # Drop the stress digit from stressed vowels (e.g. AA1 -> AA)
        new_phones = [p[:-1] if p[:-1] in vowels_with_stress else p
                      for p in phones]
        out_f.write(f'{word}\t{" ".join(new_phones)}\n')
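
# Example invocation (the input/output paths are illustrative assumptions;
# the LM build script above calls this file as local/remove_stress_marker.py):
#   python local/remove_stress_marker.py cmudict.dict dict_no_stress.txt
# For example, the CMUdict entry "ABANDON AH0 B AE1 N D AH0 N" would be
# written as "ABANDON<TAB>AH B AE N D AH N".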