54 lines
1.8 KiB
Bash
Executable File
54 lines
1.8 KiB
Bash
Executable File
#!/bin/bash

# Copyright 2014 Vassil Panayotov
# Apache 2.0

# Prepares the lexicon and the unit inventory from an existing dictionary file.
#
# Usage: $0 <lm_dir> <dir> [use_all_phonemes] [position_dependent_phone]
#   lm_dir                    directory that contains the source dictionary "dict"
#   dir                       output directory (created if it does not exist)
#   use_all_phonemes          1: copy local/all_phoneme_units.txt as the unit
#                             inventory; any other value (default 0): derive the
#                             inventory from the lexicon itself
#   position_dependent_phone  1: first rewrite the dictionary with
#                             local/make_position_dependent_phones.py;
#                             any other value (default 0): use it as is
#
# Files written to <dir>:
#   pos_dep_dict.txt        only when position_dependent_phone is 1
#   lexicon_raw_nosil.txt   first entry of every word, fields joined by one space
#   units_nosil.txt         one unit per line
#   lexicon.txt             sorted, de-duplicated lexicon_raw_nosil.txt
#   units.txt               "<unit> <index>" with indices starting from 1
#   lexicon_numbers.txt     output of tools/sym2int.pl applied to lexicon.txt
#
# local/ and tools/ are referenced by relative path, so the script has to be
# started from the directory that contains them.
#
# Exit status: 0 on success, 1 as soon as any step fails.

set -u
# Without pipefail only the last command of a pipeline decides its status.
set -o pipefail

# Prints "<script>: <message>" on stderr and aborts with status 1.
die() {
  echo "$0: $*" >&2
  exit 1
}

if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
  echo "Usage: $0 <lm_dir> <dir> [use_all_phonemes] [position_dependent_phone]" >&2
  exit 1
fi

lm_dir=$1
dir=$2
# Both switches are optional; leaving them out behaves like passing 0.
use_all_phonemes=${3:-0}
position_dependent_phone=${4:-0}

#vocab=$lm_dir/librispeech-lexicon_no_stress.txt
vocab=$lm_dir/dict
[[ -f "$vocab" ]] || die "vocabulary file not found at $vocab"

mkdir -p -- "$dir" || exit 1
echo "$dir"

if [[ "$position_dependent_phone" == 1 ]]; then
  python local/make_position_dependent_phones.py \
    --dict "$vocab" --output "$dir/pos_dep_dict.txt" \
    || die "local/make_position_dependent_phones.py failed"
  # Everything below works on the rewritten dictionary.
  vocab=$dir/pos_dep_dict.txt
fi

# Keep only the first line seen for each word (first field) and re-join the
# fields with single spaces. The input is redirected rather than piped through
# cat so that an unreadable file is reported as a failure.
perl -ne '@A = split; next if $seen{$A[0]}++; print join(" ", @A), "\n";' \
  < "$vocab" > "$dir/lexicon_raw_nosil.txt" || exit 1
echo "lexicon_raw_nosil done"

# Disabled alternative way of building units_nosil.txt, kept for reference:
# awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $dir/lexicon_raw_nosil.txt |\
# sort -u |\
# perl -e 'while(<>){
# chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
# $phones_of{$1} .= "$_ "; }
# foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \
# > $dir/units_nosil.txt || exit 1;

if [[ "$use_all_phonemes" == 1 ]]; then
  cp -- local/all_phoneme_units.txt "$dir/units_nosil.txt" \
    || die "could not copy local/all_phoneme_units.txt"
else
  # Every distinct token found after the first field becomes a unit.
  # NOTE(review): cut passes lines without any space through unchanged, so a
  # word that has no pronunciation ends up in the unit list -- confirm that
  # this is intended.
  cut -d' ' -f2- "$dir/lexicon_raw_nosil.txt" | tr ' ' '\n' | sort -u \
    > "$dir/units_nosil.txt" || exit 1
  echo "units_nosil.txt done"
fi

sort "$dir/lexicon_raw_nosil.txt" | uniq > "$dir/lexicon.txt" || exit 1

# The complete set of lexicon units, indexed by numbers starting from 1
# (read via redirection so that a path containing "=" is never mistaken by awk
# for a variable assignment).
awk '{print $1 " " NR}' < "$dir/units_nosil.txt" > "$dir/units.txt" || exit 1

# Convert character sequences into the corresponding sequences of units indices, encoded by units.txt
tools/sym2int.pl -f 2- "$dir/units.txt" \
  < "$dir/lexicon.txt" > "$dir/lexicon_numbers.txt" \
  || die "tools/sym2int.pl failed"
echo "lexicon_numbers.txt done"