competition update

2025-07-02 12:18:09 -07:00
parent 9e17716a4a
commit 77dbcf868f
2615 changed files with 1648116 additions and 125 deletions
--- a/language_model/tools/fst/ctc_compile_dict_token.sh
+++ b/language_model/tools/fst/ctc_compile_dict_token.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Copyright 2015       Yajie Miao    (Carnegie Mellon University)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABILITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
+# phoneme and character-based lexicons.
+
+dict_type="char"        # the type of lexicon; the case statement below accepts "phn", "char", "bichar" or "nchar"
+space_char=">"   # the character you have used to represent spaces (used by the char, bichar and nchar types)
+sil_prob=0       # handed to tools/fst/make_lexicon_fst.pl right after the lexicon path (presumably the silence probability)
+
+set -eo pipefail   # abort on the first failing command, including a failing stage inside a pipeline
+. tools/parse_options.sh   # sourced relative to the cwd; presumably lets --dict-type etc. override the defaults above
+
+if [ $# -ne 3 ]; then  # exactly three positional arguments are required; otherwise print the usage and fail
+  echo "usage: $0 <dict-src-dir> <tmp-dir> <lang-dir>"
+  echo "e.g.: $0 data/local/dict_phn data/local/lang_phn_tmp data/lang_phn"
+  echo "<dict-src-dir> should contain the following files:"
+  echo "lexicon.txt lexicon_numbers.txt units.txt"
+  echo "options: "
+  echo "     --dict-type <type of lexicon>                   # default: char (one of phn, char, bichar, nchar)."
+  echo "     --space-char <space character>                  # default: '>', the character to represent spaces."
+  exit 1;
+fi
+
+# Log the settings in effect, bind the three positional arguments, and stage the output directories.
+echo "dict_type: $dict_type"
+echo "space_char: $space_char"
+echo "sil_prob: $sil_prob"
+
+srcdir=$1   # dictionary source dir; lexicon.txt, lexicon_numbers.txt and units.txt are read from it
+tmpdir=$2   # scratch dir for lexiconp.txt, lexiconp_disambig.txt, disambig.list and units.list
+dir=$3      # output lang dir; receives tokens.txt, words.txt, T.fst and L.fst plus the two copied files
+mkdir -p "$dir" "$tmpdir"
+[ -f path.sh ] && . ./path.sh   # optional environment setup, sourced only when present in the cwd
+
+cp "$srcdir"/{lexicon_numbers.txt,units.txt} "$dir"
+
+# Add probabilities to lexicon entries: "word pron..." becomes "word 1.0<TAB>pron...". Every entry gets 1.0, so this
+# carries no information, but make_lexicon_fst.pl is run with --pron-probs and needs the probabilistic version.
+perl -pe 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < "$srcdir/lexicon.txt" > "$tmpdir/lexiconp.txt" || exit 1
+
+# Add disambiguation symbols to the lexicon; without them, determinizing the composition of L.fst and G.fst fails.
+# add_lex_disambig.pl prints a number, which is bumped by one so that '#'$ndisambig (the extra symbol handed to
+# make_lexicon_fst.pl further down) is also listed; disambig.list then holds #0 .. #$ndisambig, one per line.
+ndisambig=$(tools/fst/add_lex_disambig.pl "$tmpdir/lexiconp.txt" "$tmpdir/lexiconp_disambig.txt")
+ndisambig=$((ndisambig + 1))
+for ((n = 0; n <= ndisambig; n++)); do echo "#$n"; done > "$tmpdir/disambig.list"
+
+# Get the full list of CTC tokens used in FST: <eps>, the blank <blk>, SIL, the actual labels (e.g., phonemes)
+# from units.txt, and the disambiguation symbols. Ids are assigned in that order, starting at 0 for <eps>.
+awk '{print $1}' "$srcdir/units.txt" > "$tmpdir/units.list"
+printf '%s\n' '<eps>' '<blk>' 'SIL' | cat - "$tmpdir/units.list" "$tmpdir/disambig.list" | awk '{print $1 " " (NR-1)}' > "$dir/tokens.txt"
+
+# Compile the tokens into FST
+# utils/ctc_token_fst.py $dir/tokens.txt | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt \
+#   --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;
+
+# Hongyu Xiang: Eesen ctc_token_fst.py makes mistakes, as described in the CTC-CRF paper, so the corrected script is used.
+tools/fst/ctc_token_fst_corrected.py decode "$dir/tokens.txt" | fstcompile | fstarcsort --sort_type=olabel > "$dir/T.fst" || exit 1
+
+# Encode the words with indices. Will be used in lexicon and language model FST compiling.
+# Layout of words.txt: <eps> is 0, the sorted unique lexicon words (first column of lexiconp.txt) are 1..N,
+# and the reserved symbols #0, <s> and </s> follow as N+1, N+2 and N+3.
+awk '{print $1}' "$tmpdir/lexiconp.txt" | sort | uniq | awk '
+  BEGIN { print "<eps> 0"; }
+  {
+    printf("%s %d\n", $1, NR);
+  }
+  END {
+    printf("#0 %d\n", NR+1);
+    printf("<s> %d\n", NR+2);
+    printf("</s> %d\n", NR+3);
+  }' > "$dir/words.txt" || exit 1
+
+# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
+# First look up the integer ids of '#0' in both symbol tables: exact match on the first field, failing if it is absent.
+token_disambig_symbol=$(awk '$1 == "#0" {print $2; found = 1} END {exit !found}' "$dir/tokens.txt")
+word_disambig_symbol=$(awk '$1 == "#0" {print $2; found = 1} END {exit !found}' "$dir/words.txt")
+# compile_lexicon_fst <sil-symbol>: write $dir/L.fst from the disambiguated lexicon. <sil-symbol> is handed to
+# make_lexicon_fst.pl together with $sil_prob and the extra '#'$ndisambig symbol; self-loops for the two '#0' ids
+# are then added (the "echo ... |" arguments are presumably Kaldi-style piped inputs -- confirm) before arc-sorting.
+compile_lexicon_fst() {
+  local sil_symbol=$1
+  tools/fst/make_lexicon_fst.pl --pron-probs "$tmpdir/lexiconp_disambig.txt" "$sil_prob" "$sil_symbol" "#$ndisambig" \
+    | fstcompile --isymbols="$dir/tokens.txt" --osymbols="$dir/words.txt" \
+        --keep_isymbols=false --keep_osymbols=false \
+    | fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" \
+    | fstarcsort --sort_type=olabel > "$dir/L.fst"
+}
+case "$dict_type" in
+  phn)
+    compile_lexicon_fst "SIL" || exit 1
+    ;;
+  char | bichar)
+    echo "Building a character-based lexicon, with $space_char as the space"
+    compile_lexicon_fst "$space_char" || exit 1
+    ;;
+  nchar)
+    # NOTE(review): the message says "without space probability", yet the command is the same as for char | bichar.
+    echo "Building a character-based lexicon, with $space_char as the space, but without space probability"
+    compile_lexicon_fst "$space_char" || exit 1
+    ;;
+  *) echo "$0: invalid dictionary type $dict_type" && exit 1 ;;
+esac
+
+echo "Dict and token FSTs compiling succeeded"