#!/bin/sh
#
# change-lm-vocab --
#   Create a language model from an existing one by changing its
#   vocabulary.
#   All n-grams in the new vocab are retained with their original
#   probabilities.  Backoff weights are recomputed and backed-off
#   unigrams for all new words are added.
#   The -subset option performs subsetting of the vocabulary without
#   adding new words.
#
# usage: change-lm-vocab [-subset] [-tolower] -vocab vocabfile -lm oldlm \
#                        -write-lm newlm [ngram-options]
#
# $Header: /home/srilm/CVS/srilm/utils/src/change-lm-vocab,v 1.9 2013/03/09 07:13:01 stolcke Exp $
#

# Defaults: "-" for the old and new LM (passed on as-is to gzip and to
# ngram -write-lm), and an empty vocabulary file unless -vocab is given.
oldlm=-
newlm=-
vocab=/dev/null

# Start from known-empty values so that same-named variables inherited
# from the environment cannot leak into the ngram command line or the
# vocabulary filter.
options=
subset=
tolower=

# parse_args ARG...
#   Scan the command-line arguments and set the globals used by the rest
#   of the script:
#     -vocab FILE     -> vocab
#     -lm FILE        -> oldlm
#     -write-lm FILE  -> newlm
#     -tolower        -> tolower=1, and the flag is also appended to options
#     -subset         -> subset=yes
#     anything else   -> appended to options (handed to ngram unchanged)
#   Exits with status 2 if an option that takes a value is the last
#   argument.
parse_args () {
    while [ $# -gt 0 ]; do
        case "$1" in
        -vocab|-lm|-write-lm)
            if [ $# -lt 2 ]; then
                printf '%s: option %s requires an argument\n' \
                    "${0##*/}" "$1" >&2
                exit 2
            fi
            case "$1" in
            -vocab)     vocab="$2" ;;
            -lm)        oldlm="$2" ;;
            -write-lm)  newlm="$2" ;;
            esac
            # drop the option value; the option itself is shifted below
            shift
            ;;
        -tolower)
            options="$options $1"
            tolower=1
            ;;
        -subset)
            subset=yes
            ;;
        *)
            options="$options $1"
            ;;
        esac
        shift
    done
}

parse_args "$@"

# Choose the vocabulary file handed to ngram.  Without -subset it is the
# new vocab file; with -subset it is /dev/null, which prevents new words
# from being added to the LM.
case "$subset" in
"") ngram_vocab="$vocab" ;;
*)  ngram_vocab="/dev/null" ;;
esac

# filter_lm
#   Read an LM from stdin and copy it to stdout, dropping every n-gram
#   line that contains a word not listed in the vocab file.
#   Globals read:
#     vocab    path of the vocab file, one entry per line; if empty, no
#              vocabulary is loaded and every n-gram line is dropped
#     tolower  if non-empty, words are compared case-insensitively
#     GAWK     awk interpreter to run (default: gawk); deliberately left
#              unquoted so it may carry extra arguments
#   Returns non-zero if the vocab file cannot be read.
filter_lm () {
    ${GAWK-gawk} -v vocab="$vocab" -v to_lower="$tolower" '
    # Load the vocabulary into is_word[] before any LM line is processed.
    BEGIN {
        if (vocab) {
            # always include sentence begin/end
            is_word["<s>"] = is_word["</s>"] = 1;

            while ((status = (getline word < vocab)) > 0) {
                is_word[to_lower ? tolower(word) : word] = 1;
            }
            # getline yields a negative value when the file cannot be read;
            # report it instead of silently filtering out every n-gram
            if (status < 0) {
                print ("change-lm-vocab: cannot read vocab file " vocab) > "/dev/stderr";
                exit 1;
            }
            close(vocab);
        }
    }
    # blank lines are copied through
    NF == 0 {
        print; next;
    }
    # "ngram N=..." lines are copied through unchanged
    /^ngram *[0-9][0-9]*=/ {
        print; next;
    }
    # "\N-grams:" section header: remember the order N (any number of
    # digits) so the following lines are checked against N words
    /^\\[0-9][0-9]*-grams:/ {
        currorder = substr($0, 2, index($0, "-") - 2);
        print; next;
    }
    # any other line starting with a backslash is copied through
    /^\\/ {
        print; next;
    }
    # n-gram line inside a section: fields 2 .. currorder+1 are checked
    # (field 1 is presumably the log probability, as in ARPA-format LMs);
    # the line is dropped as soon as one of them is not in the vocabulary
    currorder {
        for (i = 2; i <= currorder + 1; i ++) {
            if (!((to_lower ? tolower($i) : $i) in is_word)) next;
        }
        print; next;
    }
    # everything else is copied through
    { print }
    '
}

# Decompress the old LM (gzip -dcf also passes plain text through), filter
# it against the new vocabulary, and let ngram renormalize and write the
# result.  $options is intentionally unquoted: it holds a whitespace-
# separated list of extra ngram options.
gzip -dcf "$oldlm" | filter_lm | \
ngram -lm - -vocab "$ngram_vocab" -renorm -write-lm "$newlm" $options