#!/bin/bash

# Copyright 2015       Yajie Miao    (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script compiles the lexicon and the CTC tokens into FSTs. FST compilation differs slightly
# between phoneme-based and character-based lexicons.

# Default option values. They are assigned before tools/parse_options.sh is sourced
# further down in this script.
dict_type="char"   # type of lexicon; the case statement below accepts phn, char, bichar or nchar
space_char=">"     # the character you have used to represent spaces
sil_prob=0         # passed to make_lexicon_fst.pl right after the lexicon (presumably the silence probability -- confirm)

# Stop at the first failing command; a pipeline fails if any of its stages fails.
set -eo pipefail
# NOTE(review): presumably a Kaldi-style option parser that lets --dict-type,
# --space-char and --sil-prob override the defaults -- confirm against tools/parse_options.sh.
. tools/parse_options.sh

# Exactly three positional arguments are required; otherwise print the usage and abort.
if [ $# -ne 3 ]; then
  echo "usage: ctc-crf/ctc_compile_dict_token.sh <dict-src-dir> <tmp-dir> <lang-dir>"
  echo "e.g.: ctc-crf/ctc_compile_dict_token.sh data/local/dict_phn data/local/lang_phn_tmp data/lang_phn"
  echo "<dict-src-dir> should contain the following files:"
  echo "lexicon.txt lexicon_numbers.txt units.txt"
  echo "options: "
  echo "     --dict-type <type of lexicon>                   # default: char; one of phn, char, bichar, nchar."
  echo "     --space-char <space character>                  # default: >, the character to represent spaces."
  echo "     --sil-prob <silence probability>                # default: 0."
  exit 1;
fi

# Report the effective configuration.
echo "dict_type: $dict_type"
echo "space_char: $space_char"
echo "sil_prob: $sil_prob"

# Positional arguments: dictionary source dir, scratch dir, output lang dir.
srcdir=$1
tmpdir=$2
dir=$3
mkdir -p "$dir" "$tmpdir"

# Source the local environment setup if one is present in the working directory.
[ -f path.sh ] && . ./path.sh

# The output lang dir carries its own copies of these two input files.
cp "$srcdir/lexicon_numbers.txt" "$srcdir/units.txt" "$dir"

# Add probabilities to lexicon entries. There is in fact no point in doing this here since all the entries get 1.0.
# But make_lexicon_fst.pl is run with --pron-probs and needs a probabilistic lexicon, so we insert the column anyway.
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < "$srcdir/lexicon.txt" > "$tmpdir/lexiconp.txt" || exit 1;

# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
ndisambig=$(tools/fst/add_lex_disambig.pl "$tmpdir/lexiconp.txt" "$tmpdir/lexiconp_disambig.txt")
# Reserve one extra symbol: '#'$ndisambig is handed to make_lexicon_fst.pl later in this script.
ndisambig=$((ndisambig + 1))

# Write the symbols #0 .. #ndisambig (inclusive), one per line.
for ((n = 0; n <= ndisambig; n++)); do
  echo "#$n"
done > "$tmpdir/disambig.list"

# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>, SIL, the actual labels
# (e.g., phonemes), and the disambiguation symbols. Ids are assigned in that order, starting from 0.
awk '{print $1}' "$srcdir/units.txt" > "$tmpdir/units.list"
printf '%s\n' '<eps>' '<blk>' 'SIL' \
  | cat - "$tmpdir/units.list" "$tmpdir/disambig.list" \
  | awk '{print $1 " " (NR-1)}' > "$dir/tokens.txt"

# Compile the tokens into FST
# utils/ctc_token_fst.py $dir/tokens.txt | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt \
#   --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

# Hongyu Xiang: Eesen ctc_token_fst.py makes mistakes, as described in the CTC-CRF paper. We correct it
tools/fst/ctc_token_fst_corrected.py decode "$dir/tokens.txt" \
  | fstcompile \
  | fstarcsort --sort_type=olabel > "$dir/T.fst" || exit 1;

# Encode the words with indices. Will be used in lexicon and language model FST compiling.
# <eps> gets id 0, the sorted unique words get 1..N, then #0, <s> and </s> get N+1..N+3.
awk '{print $1}' "$tmpdir/lexiconp.txt" | sort | uniq | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > "$dir/words.txt" || exit 1;

# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
# Look up the integer ids of the '#0' symbol in both symbol tables. The first field is
# matched exactly so that an entry merely containing "#0" cannot produce extra ids.
token_disambig_symbol=$(awk '$1 == "#0" {print $2}' "$dir/tokens.txt")
word_disambig_symbol=$(awk '$1 == "#0" {print $2}' "$dir/words.txt")
if [ -z "$token_disambig_symbol" ]; then
  echo "$0: symbol #0 not found in $dir/tokens.txt"
  exit 1
fi
if [ -z "$word_disambig_symbol" ]; then
  echo "$0: symbol #0 not found in $dir/words.txt"
  exit 1
fi

# The dictionary type only decides which symbol is given to make_lexicon_fst.pl
# as its third argument; the rest of the pipeline is the same for every type.
case "$dict_type" in
  phn)
    sil_symbol="SIL"
    ;;
  char | bichar)
    echo "Building a character-based lexicon, with $space_char as the space"
    sil_symbol=$space_char
    ;;
  nchar)
    # NOTE(review): the message says "without space probability", yet this type runs
    # exactly the same command as char/bichar (same $sil_prob) -- confirm this is intended.
    echo "Building a character-based lexicon, with $space_char as the space, but without space probability"
    sil_symbol=$space_char
    ;;
  *)
    echo "$0: invalid dictionary type $dict_type"
    exit 1
    ;;
esac

# Build the lexicon transducer, compile it against the token/word symbol tables,
# add self-loops for the '#0' ids found above, and sort arcs by output label.
tools/fst/make_lexicon_fst.pl --pron-probs "$tmpdir/lexiconp_disambig.txt" "$sil_prob" "$sil_symbol" "#$ndisambig" \
  | fstcompile --isymbols="$dir/tokens.txt" --osymbols="$dir/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" \
  | fstarcsort --sort_type=olabel > "$dir/L.fst" || exit 1

echo "Dict and token FSTs compiling succeeded"