b2txt25/language_model/tools/fst/ctc_compile_dict_token.sh

#!/bin/bash
# Copyright 2015       Yajie Miao    (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
# phoneme and character-based lexicons.

dict_type="char"        # the type of lexicon, either "phn" or "char"
space_char=">"   # the character you have used to represent spaces
sil_prob=0

set -eo pipefail
. tools/parse_options.sh

if [ $# -ne 3 ]; then
  echo "usage: ctc-crf/ctc_compile_dict_token.sh <dict-src-dir> <tmp-dir> <lang-dir>"
  echo "e.g.: ctc-crf/ctc_compile_dict_token.sh data/local/dict_phn data/local/lang_phn_tmp data/lang_phn"
  echo "<dict-src-dir> should contain the following files:"
  echo "lexicon.txt lexicon_numbers.txt units.txt"
  echo "options: "
  echo "     --dict-type <type of lexicon>                   # default: phn."
  echo "     --space-char <space character>                  # default: <SPACE>, the character to represent spaces."
  exit 1;
fi

echo "dict_type: $dict_type"
echo "space_char: $space_char"
echo "sil_prob: $sil_prob"

srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir

[ -f path.sh ] && . ./path.sh

cp $srcdir/{lexicon_numbers.txt,units.txt} $dir

# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.
# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;

# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
ndisambig=$[$ndisambig+1];

( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list

# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>, the actual labels (e.g.,
# phonemes), and the disambiguation symbols.
cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
(echo '<eps>'; echo '<blk>'; echo 'SIL') | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt

# Compile the tokens into FST
# utils/ctc_token_fst.py $dir/tokens.txt | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt \
#   --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

# Hongyu Xiang: Eesen ctc_token_fst.py makes mistakes, as described in the CTC-CRF paper. We correct it
tools/fst/ctc_token_fst_corrected.py decode $dir/tokens.txt | fstcompile| fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

# Encode the words with indices. Will be used in lexicon and language model FST compiling.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`

case $dict_type in
  phn)
     tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob "SIL" '#'$ndisambig | \
       fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
       --keep_isymbols=false --keep_osymbols=false |   \
       fstaddselfloops  "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
       fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
       ;;
  char | bichar)
     echo "Building a character-based lexicon, with $space_char as the space"
     tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob "$space_char" '#'$ndisambig | \
       fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
       --keep_isymbols=false --keep_osymbols=false |   \
       fstaddselfloops  "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
       fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
       ;;
  nchar)
     echo "Building a character-based lexicon, with $space_char as the space, but without space probability"
     tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob "$space_char" '#'$ndisambig | \
       fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
       --keep_isymbols=false --keep_osymbols=false |   \
       fstaddselfloops  "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
       fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
       ;;
  *) echo "$0: invalid dictionary type $dict_type" && exit 1;
esac

echo "Dict and token FSTs compiling succeeded"
competition update 2025-07-02 12:18:09 -07:00			`#!/bin/bash`
			`# Copyright 2015 Yajie Miao (Carnegie Mellon University)`

			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# THIS CODE IS PROVIDED AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY`
			`# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED`
			`# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,`
			`# MERCHANTABLITY OR NON-INFRINGEMENT.`
			`# See the Apache 2 License for the specific language governing permissions and`
			`# limitations under the License.`

			`# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the`
			`# phoneme and character-based lexicons.`

			`dict_type="char" # the type of lexicon, either "phn" or "char"`
			`space_char=">" # the character you have used to represent spaces`
			`sil_prob=0`

			`set -eo pipefail`
			`. tools/parse_options.sh`

			`if [ $# -ne 3 ]; then`
			`echo "usage: ctc-crf/ctc_compile_dict_token.sh <dict-src-dir> <tmp-dir> <lang-dir>"`
			`echo "e.g.: ctc-crf/ctc_compile_dict_token.sh data/local/dict_phn data/local/lang_phn_tmp data/lang_phn"`
			`echo "<dict-src-dir> should contain the following files:"`
			`echo "lexicon.txt lexicon_numbers.txt units.txt"`
			`echo "options: "`
			`echo " --dict-type <type of lexicon> # default: phn."`
			`echo " --space-char <space character> # default: <SPACE>, the character to represent spaces."`
			`exit 1;`
			`fi`

			`echo "dict_type: $dict_type"`
			`echo "space_char: $space_char"`
			`echo "sil_prob: $sil_prob"`

			`srcdir=$1`
			`tmpdir=$2`
			`dir=$3`
			`mkdir -p $dir $tmpdir`

			`[ -f path.sh ] && . ./path.sh`

			`cp $srcdir/{lexicon_numbers.txt,units.txt} $dir`

			`# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.`
			`# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.`
			`perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt \|\| exit 1;`

			`# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.`
			`# Without these symbols, determinization will fail.`
			ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
			`ndisambig=$[$ndisambig+1];`

			( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list

			`# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>, the actual labels (e.g.,`
			`# phonemes), and the disambiguation symbols.`
			`cat $srcdir/units.txt \| awk '{print $1}' > $tmpdir/units.list`
			`(echo '<eps>'; echo '<blk>'; echo 'SIL') \| cat - $tmpdir/units.list $tmpdir/disambig.list \| awk '{print $1 " " (NR-1)}' > $dir/tokens.txt`

			`# Compile the tokens into FST`
			`# utils/ctc_token_fst.py $dir/tokens.txt \| fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt \`
			`# --keep_isymbols=false --keep_osymbols=false \| fstarcsort --sort_type=olabel > $dir/T.fst \|\| exit 1;`

			`# Hongyu Xiang: Eesen ctc_token_fst.py makes mistakes, as described in the CTC-CRF paper. We correct it`
			`tools/fst/ctc_token_fst_corrected.py decode $dir/tokens.txt \| fstcompile\| fstarcsort --sort_type=olabel > $dir/T.fst \|\| exit 1;`

			`# Encode the words with indices. Will be used in lexicon and language model FST compiling.`
			`cat $tmpdir/lexiconp.txt \| awk '{print $1}' \| sort \| uniq \| awk '`
			`BEGIN {`
			`print "<eps> 0";`
			`}`
			`{`
			`printf("%s %d\n", $1, NR);`
			`}`
			`END {`
			`printf("#0 %d\n", NR+1);`
			`printf("<s> %d\n", NR+2);`
			`printf("</s> %d\n", NR+3);`
			`}' > $dir/words.txt \|\| exit 1;`

			`# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.`
			token_disambig_symbol=`grep \#0 $dir/tokens.txt \| awk '{print $2}'`
			word_disambig_symbol=`grep \#0 $dir/words.txt \| awk '{print $2}'`

			`case $dict_type in`
			`phn)`
			`tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob "SIL" '#'$ndisambig \| \`
			`fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \`
			`--keep_isymbols=false --keep_osymbols=false \| \`
			`fstaddselfloops "echo $token_disambig_symbol \|" "echo $word_disambig_symbol \|" \| \`
			`fstarcsort --sort_type=olabel > $dir/L.fst \|\| exit 1;`
			`;;`
			`char \| bichar)`
			`echo "Building a character-based lexicon, with $space_char as the space"`
			`tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob "$space_char" '#'$ndisambig \| \`
			`fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \`
			`--keep_isymbols=false --keep_osymbols=false \| \`
			`fstaddselfloops "echo $token_disambig_symbol \|" "echo $word_disambig_symbol \|" \| \`
			`fstarcsort --sort_type=olabel > $dir/L.fst \|\| exit 1;`
			`;;`
			`nchar)`
			`echo "Building a character-based lexicon, with $space_char as the space, but without space probability"`
			`tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob "$space_char" '#'$ndisambig \| \`
			`fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \`
			`--keep_isymbols=false --keep_osymbols=false \| \`
			`fstaddselfloops "echo $token_disambig_symbol \|" "echo $word_disambig_symbol \|" \| \`
			`fstarcsort --sort_type=olabel > $dir/L.fst \|\| exit 1;`
			`;;`
			`*) echo "$0: invalid dictionary type $dict_type" && exit 1;`
			`esac`

			`echo "Dict and token FSTs compiling succeeded"`