competition update

This commit is contained in:
nckcard
2025-07-02 12:18:09 -07:00
parent 9e17716a4a
commit 77dbcf868f
2615 changed files with 1648116 additions and 125 deletions


@@ -0,0 +1,226 @@
#
# File: Makefile.example
# Author: The SRI DECIPHER (TM) System
# Date: Thu Sep 9 12:04:47 1993
#
# Description:
# This is the example makefile to start from when adding new
# modules to the DECIPHER System. To use this makefile, first
# copy it to your directory as the file "Makefile". Second,
# replace the word "Example" in the text below with the real name
# of your library. Next replace the example filenames with
# the names of your actual declarations and source files in the
# appropriate variable definitions. Finally clean up by deleting
# any lines not relevant to your module and updating this header
# to describe your new module. Do not forget to use the proper
# RCS keywords!
#
# Copyright (c) 1993, SRI International. All Rights Reserved.
#
# $Header: /home/srilm/CVS/srilm/utils/src/Makefile,v 1.76 2019/02/09 07:36:09 stolcke Exp $
#
# Include common SRILM variable definitions.
include $(SRILM)/common/Makefile.common.variables
# This should enable locale-specific string collation for vocabulary sorting
# (it will slow things down somewhat).
#ADDITIONAL_CXXFLAGS = -Dstrcmp=strcoll
# Flags for generating "compact" data structures
COMPACT_FLAGS += -DUSE_SARRAY -DUSE_SARRAY_TRIE -DUSE_SARRAY_MAP2
# Flags for generating "short" data structures
SHORT_FLAGS = $(COMPACT_FLAGS) -DUSE_SHORT_VOCAB -DUSE_XCOUNTS
# Flags for generating "long long" data structures
LLONG_FLAGS = $(COMPACT_FLAGS) -DUSE_LONGLONG_COUNTS -DUSE_XCOUNTS
# enable use of liblbfgs if indicated
ifneq ($(HAVE_LIBLBFGS), )
ADDITIONAL_CFLAGS += -DHAVE_LIBLBFGS
ADDITIONAL_CXXFLAGS += -DHAVE_LIBLBFGS
endif
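# For example, "make HAVE_LIBLBFGS=yes" on the make command line (or,
# presumably, an equivalent setting in the machine-specific makefile)
# enables the liblbfgs-dependent code paths above.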
ADDITIONAL_LDFLAGS += \
$(MATHERR_LINK)
ADDITIONAL_LIBRARIES += \
$(SRILM_LIBDIR)/$(LIB_PREFIX)oolm$(LIB_SUFFIX) \
$(SRILM_LIBDIR)/$(LIB_PREFIX)dstruct$(LIB_SUFFIX) \
$(SRILM_LIBDIR)/$(LIB_PREFIX)misc$(LIB_SUFFIX) \
$(SRILM_LIBDIR)/$(LIB_PREFIX)z$(LIB_SUFFIX) \
$(MATH_LIBRARY) \
$(LBFGS_LIBRARY)
# Exported programs.
REAL_PROGRAM_NAMES = \
nbest-rover-helper
# Example programs.
PROGRAM_NAMES = $(REAL_PROGRAM_NAMES)
PROGRAMS = $(PROGRAM_NAMES:%=$(BINDIR)/%$(EXE_SUFFIX))
PROGRAM_SOURCES = $(foreach prog,$(PROGRAM_NAMES),\
$(wildcard $(SRCDIR)/$(prog).c) \
$(wildcard $(SRCDIR)/$(prog).cc))
PROGRAM_OBJECTS = $(PROGRAM_NAMES:%=$(OBJDIR)/%$(OBJ_SUFFIX))
# Libraries to be linked with the Example programs.
LIBRARIES = $(LIBRARY) \
$(ADDITIONAL_LIBRARIES)
# All of the types of files.
ALL_SOURCES = $(PROGRAM_SOURCES)
ALL_OBJECTS = $(PROGRAM_OBJECTS)
ALL_PROGRAMS = $(PROGRAMS)
ALL_PROGRAM_NAMES = $(PROGRAM_NAMES)
#
SCRIPTS = \
rescore-nbest \
wordlat-to-lisp \
extract-skip-probs \
$(EXPORTED_SCRIPTS)
EXPORTED_SCRIPTS = \
change-lm-vocab \
empty-sentence-lm \
rescore-decipher \
rescore-acoustic \
rescore-reweight \
rescore-minimize-wer \
make-batch-counts \
merge-batch-counts \
make-big-lm \
make-multiword-pfsg \
pfsg-from-ngram \
nbest-error \
nbest-rover \
search-rover-combo \
rexport.gnumake \
align-with-tags \
compute-sclite \
compute-sclite-nbest \
compare-sclite \
cumbin
# scripts that need to be edited before installation
EDIT_SCRIPTS = \
add-classes-to-pfsg \
add-dummy-bows \
add-pauses-to-pfsg \
add-ppls \
bytelog-to-log10 \
classes-to-fsm \
combine-acoustic-scores \
combine-rover-controls \
rover-control-weights \
rover-control-tying \
compare-ppls \
compute-best-mix \
compute-best-rover-mix \
compute-best-sentence-mix \
compute-oov-rate \
concat-sausages \
context-ngrams \
continuous-ngram-count \
de-vq-lm \
extract-skip-probs \
filter-event-counts \
find-reference-posteriors \
fix-ctm \
fsm-to-pfsg \
get-gt-counts \
get-unigram-probs \
hits-from-log \
log10-to-bytelog \
make-abs-discount \
make-diacritic-map \
make-google-ngrams \
make-gt-discounts \
make-kn-discounts \
make-kn-counts \
make-hiddens-lm \
make-lm-subset \
make-nbest-pfsg \
make-ngram-pfsg \
make-sub-lm \
metadb \
sort-lm \
reverse-lm \
merge-nbest \
nbest-posteriors \
nbest2-to-nbest1 \
nbest-optimize-args-from-rover-control \
nbest-oov-counts \
nbest-vocab \
nbest-words \
pfsg-to-dot \
pfsg-to-fsm \
pfsg-vocab \
htklat-vocab \
ppl-from-log \
remove-lowprob-ngrams \
replace-unk-words \
replace-words-with-classes \
reverse-text \
reverse-ngram-counts \
sentid-to-sclite \
sentid-to-ctm \
split-tagged-ngrams \
subset-context-ngrams \
subtract-ppls \
tolower-ngram-counts \
uniform-classes \
uniq-ngram-counts \
vp2text \
wlat-to-dot \
wlat-to-pfsg \
wlat-stats \
wordlat-to-lisp \
prettify \
select-vocab
# Define targets.
all: $(PROGRAMS)
$(LIBRARY): $(LIB_OBJECTS)
$(ARCHIVE) $(AR_OUTPUT_OPTION) $^ $(DEMANGLE_FILTER)
$(RANLIB) $@ $(DEMANGLE_FILTER)
$(PROGRAMS): $(LIBRARY) $(OTHER_LIBRARIES)
# Variables and Targets for released system
EXPORTED_PROGRAMS = \
$(EDIT_SCRIPTS:%=$(BINDIR)/%) \
$(REAL_PROGRAM_NAMES:%=$(BINDIR)/%$(EXE_SUFFIX))
release: release-scripts release-programs
# Include common SRILM target definitions.
include $(SRILM)/common/Makefile.common.targets
#
# Rule to create edited gawk script
#
$(BINDIR)/%: $(SRCDIR)/%.gawk $(BINDIR_STAMP) $(SRILM)/common/Makefile.machine.$(MACHINE_TYPE)
sed -e '1s,/usr/local/bin/gawk,$(GAWK),' $< >$@.new
mv $@.new $@
#
# Rule to create edited perl script
#
$(BINDIR)/%: $(SRCDIR)/%.pl $(BINDIR_STAMP) $(SRILM)/common/Makefile.machine.$(MACHINE_TYPE)
sed -e '1s,/usr/local/bin/perl,$(PERL),' $< >$@.new
mv $@.new $@


@@ -0,0 +1,172 @@
#!/usr/local/bin/gawk -f
#
# add-classes-to-pfsg --
# Modify Decipher PFSG by expanding class nodes with words
#
# usage: add-classes-to-pfsg classes=<expansions> pfsg > expanded-pfsg
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-classes-to-pfsg.gawk,v 1.5 2004/11/02 02:00:35 stolcke Exp $
#
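# A hypothetical example of the <expansions> file read below (class name,
# optional probability, then the expansion words):
#
#	CITY 0.6 new york
#	CITY 0.4 san francisco
#	GREETING hello there
#
# Expansion lines without an explicit probability default to 1/N, where N is
# the number of expansions listed for that class.
#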
function read_classes(file) {
num_class_defs = 0;
delete num_class_expansions;
delete class_expansions;
delete class_expansion_probs;
while ((getline line < file) > 0) {
n = split(line, a);
if (n == 0) continue;
class = a[1];
num_exp = ++ num_class_expansions[class];
if (a[2] ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
prob = a[2];
i = 3;
} else {
prob = "";
i = 2;
}
expansion = a[i];
for (i++; i <= n; i++) {
expansion = expansion " " a[i];
}
class_expansions[class " " num_exp] = expansion;
if (prob != "") {
class_expansion_probs[class " " num_exp] = prob;
}
num_class_defs ++;
}
print "read " num_class_defs " class expansions" >> "/dev/stderr";
# assign default expansion probs
for (class in num_class_expansions) {
num_exp = num_class_expansions[class];
for (i = 1; i <= num_exp; i ++) {
if (class_expansion_probs[class " " i] == "") {
class_expansion_probs[class " " i] = 1/num_exp;
}
}
}
}
######################################################################
BEGIN {
logscale = 10000.5;
round = 0.5;
null = "NULL";
classes_toupper = 1; # map class names to upper case
}
function rint(x) {
if (x < 0) {
return int(x - round);
} else {
return int(x + round);
}
}
function scale_prob(x) {
return rint(log(x) * logscale);
}
function print_class_pfsg(class) {
print "name " (classes_toupper ? toupper(class) : class);
# compute total number of nodes needed
num_exp = num_class_expansions[class];
num_words = 0;
all_words = "";
for (i = 1; i <= num_exp; i ++) {
num_words += split(class_expansions[class " " i], a);
all_words = all_words " " class_expansions[class " " i];
}
print "nodes " (num_words + 2) " " null " " null all_words;
initial = 0;
final = 1;
print "initial " initial;
print "final " final;
print "transitions " (num_words + num_exp);
node_index = final;
for (i = 1; i <= num_exp; i ++) {
n = split(class_expansions[class " " i], a);
if (n == 0) {
print initial, final, \
scale_prob(class_expansion_probs[class " " i]);
} else {
print initial, ++node_index, \
scale_prob(class_expansion_probs[class " " i]);
for (k = 2; k <= n; k ++) {
print node_index, node_index + 1, 0;
node_index ++;
}
print node_index, final, 0;
}
}
print "";
}
NR == 1 {
if (classes) {
read_classes(classes);
}
close(classes);
}
# record class names used in PFSGs
$1 == "nodes" {
for (i = 3; i <= NF; i ++) {
if ($i != null && $i in num_class_expansions) {
class_used[$i] = 1;
if (classes_toupper) {
upper_class = toupper($i);
if ($i != upper_class && upper_class in num_class_expansions) {
print "cannot map class " $i \
" to uppercase due to name conflict" >> "/dev/stderr";
exit 1;
}
$i = upper_class;
}
}
}
print;
next;
}
# pass old PFSGs through unchanged
{
print;
}
# dump out class PFSGs
END {
print "";
for (class in class_used) {
print_class_pfsg(class);
}
}


@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# add-dummy-bows --
# add redundant backoff weights to model file to make some broken
# programs happy.
# (Normally a backoff weight is only required for ngrams that
# are prefixes of longer ngrams.)
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-dummy-bows.gawk,v 1.1 1995/09/20 17:36:30 stolcke Exp $
#
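# For illustration (hypothetical trigram LM): a bigram entry that lacks a
# backoff weight, e.g.
#	-1.2345 in the
# is printed with a dummy zero backoff weight appended:
#	-1.2345 in the	0
#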
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
if (order > highorder) highorder = order;
print;
next;
}
/^.[0-9]-grams:/ {
currorder=substr($0,2,1);
}
/^\\/ {
print; next;
}
currorder && currorder < highorder {
if (NF < currorder + 2) {
print $0 "\t0";
} else {
print;
}
next;
}
{ print }


@@ -0,0 +1,171 @@
#!/usr/local/bin/gawk -f
#
# add-pauses-to-pfsg --
# Modify Decipher PFSG to allow pauses between words
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-pauses-to-pfsg.gawk,v 1.15 2015-07-03 03:44:52 stolcke Exp $
#
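# A hypothetical invocation (the variable assignments are optional and
# override the defaults set in the BEGIN block below; "words.txt" is a
# placeholder word list):
#
#	add-pauses-to-pfsg vocab=words.txt pauselast=1 model.pfsg > model.pause.pfsg
#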
BEGIN {
pause = "-pau-";
top_level_name = "TOP_LEVEL";
pause_filler_name = "PAUSE_FILLER";
null = "NULL";
wordwrap = 1; # wrap pause filler around words
pauselast = 0; # make pauses follow wrapped words
version = 0; # no "version" line by default
}
#
# output the TOP_LEVEL model
# oldname is the name of the original pfsg
function print_top_level(oldname) {
if (version) {
print "version " version "\n";
}
print "name " top_level_name;
if (pauselast) {
print "nodes 4 " null " " pause_filler_name " " oldname " " null;
} else {
print "nodes 4 " null " " oldname " " pause_filler_name " " null;
}
print "initial 0"
print "final 3"
print "transitions 4"
print "0 1 0"
print "1 2 0"
if (pauselast) {
print "0 2 0"
} else {
print "1 3 0"
}
print "2 3 0"
print "";
}
function word_wrapper_name(word) {
return "_" word "_PF";
}
#
# output a pause wrapper for word
#
function print_word_wrapper(word) {
print "name " word_wrapper_name(word);
if (pauselast) {
print "nodes 3 " word " " pause_filler_name " " null;
} else {
print "nodes 3 " null " " pause_filler_name " " word;
}
print "initial 0";
print "final 2";
print "transitions 3";
print "0 1 0";
print "1 2 0";
print "0 2 0";
print "";
}
#
# output the pause filler
#
function print_pause_filler() {
print "name " pause_filler_name;
print "nodes 3 " null " " pause " " null;
print "initial 0";
print "final 2";
print "transitions 3";
print "0 1 0";
print "1 1 0";
print "1 2 0";
}
NF == 0 {
print;
next;
}
#
# read vocabulary list if supplied
#
NR == 1 && vocab != "" {
while (getline line < vocab) {
if (split(line, a)) {
word_list[a[1]] = 1;
}
}
close (vocab);
}
#
# check that a node name is a word
# if a vocabulary was not specified we use the following heuristic:
# word nodes contain at least one lowercase or non-ascii character and are not
# surrounded by "*...*" (which indicates a class name).
#
function is_word(w) {
if (vocab) {
return w in word_list;
} else {
return !is_classname(w);
}
}
function is_classname(w) {
return w ~ /^\*.*\*$/ || !(w ~ /[[:lower:]]/ || w ~ /[^\x00-\x7F]/);
}
#
# first time we see a pfsg name, issue a top-level wrapper for it.
#
$1 == "name" && !have_top_level {
print_top_level($2);
print;
have_top_level = 1;
next;
}
#
# maps word nodes to wrapper nodes
#
$1 == "nodes" {
numnodes = $2;
printf "nodes %d", numnodes;
for (i = 0; i < numnodes; i ++) {
node_name = $(i + 3);
# if it contains lowercase characters it's a word and
# needs to be wrapped
if (wordwrap && is_word(node_name) && \
node_name != pause && node_name != null)
{
if (!(node_name in all_words)) {
all_words[node_name] = 1;
words[++num_words] = node_name;
}
printf " %s", word_wrapper_name(node_name);
} else {
printf " %s", node_name;
}
}
printf "\n";
next;
}
{
print;
}
END {
#
# output the word wrappers
#
if (wordwrap) {
for (i = 1; i <= num_words; i ++) {
print_word_wrapper(words[i]);
}
}
print_pause_filler();
}


@@ -0,0 +1,30 @@
#!/usr/local/bin/gawk -f
#
# add-ppls --
# Add text statistics (from -ppl output)
#
# Copyright (c) 1995,1997 SRI International. All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-ppls.gawk,v 1.2 1997/07/12 05:01:08 stolcke Exp $
#
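# A hypothetical usage sketch: sum the statistics from several ngram -ppl
# outputs and report the aggregate perplexity (file names are placeholders):
#
#	ngram -lm my.lm -ppl part1.txt > ppl.1
#	ngram -lm my.lm -ppl part2.txt > ppl.2
#	add-ppls ppl.1 ppl.2
#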
/^file .*: .* sentences/ {
totalsents += $3;
totalwords += $5;
totaloovs += $7;
getline;
zeroprobs += $1;
totalprob += $4;
}
END {
M_LN10 = 2.30258509299404568402; # from <math.h>
ppl = exp (- M_LN10 * totalprob / \
(totalwords - totaloovs - zeroprobs + totalsents));
printf "file TOTAL: %d sentences, %d words, %d OOVs\n", \
totalsents, totalwords, totaloovs;
printf "%d zeroprobs, logprob= %g ppl= %g\n", \
zeroprobs, totalprob, ppl;
}


@@ -0,0 +1,194 @@
#!/bin/sh
#
# align-with-tags --
# align reference transcript with tags to hypothesized
# transcripts, merging the tags into the latter
#
# $Header: /home/srilm/CVS/srilm/utils/src/align-with-tags,v 1.7 2015-07-03 03:45:38 stolcke Exp $
#
usage () {
echo "usage: $0 [-r ref -h hyp] [-dictionary D] [-aligndir A] [-options...]" >&2
exit 2;
}
ref=/dev/null
hyp=/dev/null
dictionary=/dev/null
while [ $# -gt 0 ]; do
case "$1" in
-r) ref="$2"
shift; shift;;
-h) hyp="$2"
shift; shift;;
-dictionary)
dictionary=$2
shift; shift;;
-aligndir)
aligndir=$2
shift; shift;;
-\?) usage;;
-*) pass_options="$pass_options $1"
shift;;
*) break;;
esac
done
if [ $# -ge 2 ]; then
ref="$1"
hyp="$2"
elif [ $# -gt 0 ]; then
usage;
fi
tmpdir=${TMPDIR-/tmp}
tmpdict="$tmpdir/dict$$"
tmptags="$tmpdir/tags$$"
tmprefs="$tmpdir/refs$$"
tmphyps="$tmpdir/hyps$$"
tmpnbest="$tmpdir/nbest$$"
tmpmerge="$tmpdir/merged$$"
if [ -n "$aligndir" ]; then
tmpmerge=
fi
trap "rm -rf $tmpdict $tmptags $tmprefs $tmphyps $tmpnbest $tmpmerge; exit" 0 1 2 15
if [ -n "$aligndir" ]; then
mkdir -p $aligndir
tmpmerge=$aligndir
fi
prepare_text () {
${GAWK-gawk} -v tag_file=$2 '
BEGIN {
tag_list["<default>"] = 1;
}
function is_tag(x) {
return (x ~ /^<.*>$/);
}
{
for (i = 2; i <= NF; i ++) {
if (is_tag($i)) {
tag_list[$i] = 1;
} else {
$i = tolower($i);
}
if (!is_tag($(i - 1)) && !is_tag($i)) {
$(i - 1) = $(i - 1) " <default>";
}
}
if (!is_tag($NF)) {
$NF = $NF " <default>";
}
print $0;
}
END {
if (tag_file) {
for (tag in tag_list) {
print tag > tag_file;
}
}
}' $1;
}
parse_alignment () {
gzip -d -c -f < $1 | \
${GAWK-gawk} -v sentid=$2 'BEGIN {
output = sentid;
show_refs = 1;
}
function is_empty(x) {
return x == "<default>" || tolower(x) == "*delete*";
}
function is_tag(x) {
return x ~ /^<.*>$/;
}
$1 == "align" {
if (NF == 4 && $4 == 1) {
# matching hyp and ref
if (!is_empty($3)) {
output = output " " $3;
}
} else if (NF == 6 && $4 == 1 && $6 == 0) {
# mismatched hyp and ref
if (is_empty($3)) {
if (is_tag($5)) {
if (!is_empty($5)) \
output = output " " $5;
} else if (show_refs) {
output = output " (" $5 ")";
}
} else {
if (is_empty($5) || !show_refs) {
output = output " " $3;
} else {
output = output " " $3 " (" $5 ")";
}
}
} else {
print "unexpected alignment: " $0 > "/dev/stderr";
}
}
END {
print output;
}'
}
set -e
#
# format hyps and refs for alignment
#
prepare_text $ref $tmptags > $tmprefs
prepare_text $hyp > $tmphyps
#
# add tag pronunciations to the dictionary
#
if [ $dictionary != /dev/null ]; then
gzip -d -c -f $dictionary > $tmpdict
else
> $tmpdict
fi
${GAWK-gawk} '{ print $1, "**TAG**" }' $tmptags >> $tmpdict
#
# do the alignments
#
mkdir -p $tmpnbest $tmpmerge
cat $tmphyps | \
while read sentid words
do
echo "0 0 0 $words" > $tmpnbest/$sentid
echo $tmpnbest/$sentid
done | \
nbest-lattice -nbest-files - \
-use-mesh \
-dictionary $tmpdict \
-keep-noise \
-refs "$tmprefs" \
$pass_options \
-write-dir $tmpmerge | \
(
last_sentid=
while read sentid rest
do
if [ -n "$last_sentid" ]; then
parse_alignment $tmpmerge/$last_sentid.gz $last_sentid
fi
last_sentid=$sentid
done
if [ -n "$last_sentid" ]; then
parse_alignment $tmpmerge/$last_sentid.gz $last_sentid
fi
)


@@ -0,0 +1,19 @@
#!/usr/local/bin/gawk -f
#
# bytelog-to-log10 --
# convert bytelog scores to log-base-10
#
# $Header: /home/srilm/CVS/srilm/utils/src/bytelog-to-log10.gawk,v 1.2 2002/05/15 04:47:13 stolcke Exp $
#
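# The conversion implemented below (with the default scale=1) is
#	log10(p) = bytelog / (ln(10) * 10000.5 / 1024)
# so, for example, a bytelog score of about -22.5 corresponds to roughly -1
# in log base 10.
#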
BEGIN {
logscale = 2.30258509299404568402 * 10000.5 / 1024.0;
scale = 1;
}
{
for (i = 1; i <= NF; i ++) {
if ($i ~ /^[-+]+[0-9][0-9]*$/) {
$i = $i / scale / logscale;
}
}
print;
}


@@ -0,0 +1,78 @@
#!/bin/sh
#
# change-lm-vocab --
# create a language model from an existing one by changing its
# vocabulary.
# All n-grams in the new vocab are retained with their original
# probabilities. Backoff weights are recomputed and backed-off
# unigrams for all new words are added.
# -subset option performs subsetting of the vocabulary without adding
# new words.
#
# usage: change-lm-vocab [-subset] -vocab vocabfile -lm oldlm -write-lm newlm
#
# $Header: /home/srilm/CVS/srilm/utils/src/change-lm-vocab,v 1.9 2013/03/09 07:13:01 stolcke Exp $
#
oldlm=-
newlm=-
vocab=/dev/null
while [ $# -gt 0 ]; do
case "$1" in
-vocab) vocab="$2" ; shift ;;
-lm) oldlm="$2" ; shift ;;
-write-lm) newlm="$2" ; shift ;;
-tolower) options="$options $1" ; tolower=1 ;;
-subset) subset=yes ;;
*) options="$options $1" ;;
esac
shift
done
# -subset prevents new words being added to the LM
if [ "$subset" ]; then
ngram_vocab="/dev/null"
else
ngram_vocab="$vocab"
fi
gzip -dcf $oldlm | ${GAWK-gawk} '
# read the vocab file
NR == 1 && vocab {
# always include sentence begin/end
is_word["<s>"] = is_word["</s>"] = 1;
while ((getline word < vocab) > 0) {
is_word[to_lower ? tolower(word) : word] = 1;
}
close(vocab);
}
# process old lm
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
print;
next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
print;
next;
}
/^\\/ {
print; next;
}
currorder {
for (i = 2 ; i <= currorder + 1; i ++) {
if (!((to_lower ? tolower($i) : $i) in is_word)) next;
}
print;
next;
}
{ print }
' vocab=$vocab to_lower=$tolower | \
ngram -lm - -vocab "$ngram_vocab" -renorm -write-lm "$newlm" $options


@@ -0,0 +1,134 @@
#!/usr/local/bin/gawk -f
#
# usage: classes-to-fsm [symbolic=1] [isymbolfile=ISYMBOLS] [osymbolfile=OSYMBOLS] \
# vocab=VOCAB CLASSES > class.fsm
#
# where ISYMBOLS is the input symbol table, OSYMBOLS is the output symbol table
# VOCAB is the word list
#
# $Header: /home/srilm/CVS/srilm/utils/src/classes-to-fsm.gawk,v 1.1 1999/09/27 01:10:27 stolcke Exp $
#
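# A hypothetical class definition line (in the CLASS [PROB] WORD1 ... WORDN
# format handled below):
#
#	CITY 0.5 new york
#
# produces a transition path that consumes CITY on the input side and emits
# "new" and "york" on the output side, with the class-expansion probability
# encoded as a scaled negative-log cost on the first arc.
#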
BEGIN {
empty_input = "NULL";
empty_output = "NULL";
input_symbols[empty_input] = 0;
output_symbols[empty_output] = 0;
numinputs = 1;
numoutputs = 1;
isymbolfile = "";
osymbolfile = "";
symbolic = 0;
startstate = 0;
numstates = 1;
M_LN10 = 2.30258509299404568402; # from <math.h>
logscale = 10000.5;
round = 0.5;
}
NR == 1 {
# print start/end state
print startstate;
if (vocab) {
while ((getline vline < vocab) > 0) {
if (split(vline, a) >= 1) {
word = a[1];
input_symbols[word] = numinputs ++;
output_symbols[word] = numoutputs ++;
# print identity transition for vocab words
print startstate, startstate, \
(symbolic ? word : input_symbols[word]), \
(symbolic ? word : output_symbols[word]);
}
}
}
}
function rint(x) {
if (x < 0) {
return int(x - round);
} else {
return int(x + round);
}
}
function scale_prob(x) {
return rint(log(x) * logscale);
# return log(x) / M_LN10;
}
# input format is
# CLASS [PROB] WORD1 WORD2 ... WORDN
{
if (NF == 0) {
next;
}
class = $1;
if (!(class in input_symbols)) {
input_symbols[class] = numinputs++;
}
if ($2 ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
prob = $2;
first = 3;
} else {
prob = 1;
first = 2;
}
# deal with empty class expansion: map class to NULL
if (first > NF) {
print startstate, startstate, \
(symbolic ? class : input_symbols[class]), \
(symbolic ? empty_output : 0), \
-scale_prob(prob);
}
for (i = first; i <= NF; i ++) {
if (!($i in output_symbols)) {
output_symbols[$i] = numoutputs ++;
}
if (i == NF) {
next_state = startstate;
} else {
next_state = numstates ++;
}
if (i == first) {
print startstate, next_state,
(symbolic ? class : input_symbols[class]), \
(symbolic ? $i : output_symbols[$i]), \
-scale_prob(prob);
} else {
print last_state, next_state,
(symbolic ? empty_input : 0), \
(symbolic ? $i : output_symbols[$i]), \
-scale_prob(1);
}
last_state = next_state;
}
}
END {
if (isymbolfile) {
for (word in input_symbols) {
print word, input_symbols[word] > isymbolfile;
}
close(isymbolfile);
}
if (osymbolfile) {
for (word in output_symbols) {
print word, output_symbols[word] > osymbolfile;
}
close(osymbolfile);
}
}


@@ -0,0 +1,114 @@
#!/usr/local/bin/gawk -f
#
# combine acoustic scores in nbest lists with additional acoustic score files
# (used by rescore-acoustic and nbest-rover)
#
# Setting "max_nbest" limits the number of hyps retrieved from each
# input list.
# If max_nbest is set and an additional score file contains fewer values
# than the nbest list has hyps, the missing values are filled in with the
# minimal score found in that file.
#
# $Header: /home/srilm/CVS/srilm/utils/src/combine-acoustic-scores.gawk,v 1.9 2019/02/22 20:55:10 stolcke Exp $
#
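# A hypothetical invocation (as used by rescore-acoustic/nbest-rover; file
# names are placeholders): the first argument is the nbest list, the remaining
# arguments are extra score files, and "weights" must supply one weight per
# file (passed with -v so it is not mistaken for a file):
#
#	combine-acoustic-scores -v weights="1.0 0.3" -v max_nbest=200 \
#		nbest/sent001.gz scores/sent001.extra > combined/sent001
#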
function get_from_file(i) {
if (ARGV[i] ~ /\.gz$/) {
status = (("exec gzip -dc " ARGV[i]) | getline);
} else {
status = (getline < ARGV[i]);
}
if (status < 0) {
print "error reading from " ARGV[i] >> "/dev/stderr";
exit 1;
}
return status;
}
BEGIN {
hypno = 0;
sentid = ARGV[1];
sub(".*/", "", sentid);
sub("\\.gz$", "", sentid);
sub("\\.score$", "", sentid);
bytelogscale = 1024.0 / 10000.5 / 2.30258509299404568402;
nweights = split(weights, weight);
if (nweights != ARGC - 1) {
print "number of weights doesn't match number of score files" \
>> "/dev/stderr";
exit 1;
}
# format of input nbest list
nbestformat = 0;
while ((max_nbest == 0 || hypno < max_nbest) && get_from_file(1)) {
if ($1 == "NBestList1.0") {
nbestformat = 1;
print;
continue;
} else if ($1 == "NBestList2.0") {
nbestformat = 2;
print;
continue;
}
old_ac = $1; $1 = "";
if (nbestformat > 0) {
# Decipher nbest format: just use the aggregate
# score as the acoustic score
# For version 2 format, the total score is updated,
# reflecting the change in acoustic scores.
# Other programs recover the acoustic score as the
# difference of the total score and the accumulated
# LM scores, so this gives the right results.
gsub("[()]", "", old_ac);
old_ac *= bytelogscale;
}
hyp = $0;
total_ac = weight[1] * old_ac;
for (i = 2; i < ARGC; i ++) {
if (!get_from_file(i)) {
if (max_nbest == 0) {
print "missing score in " ARGV[i] \
>> "/dev/stderr";
exit 2
} else {
new_ac = min_score[i];
}
} else {
# skip nbest header
if ($1 ~ /NBestList/) {
i --;
continue;
}
new_ac = $1;
# handle decipher-style scores
if (new_ac ~ /\(.*\)/) {
gsub("[()]", "", new_ac);
new_ac *= bytelogscale;
}
# replace minimum score if needed
if (!(i in min_score) || $1 < min_score[i]) {
min_score[i] = new_ac;
}
}
total_ac += weight[i] * new_ac;
}
if (nbestformat > 0) {
total_ac = sprintf("(%f)", total_ac / bytelogscale);
}
print total_ac hyp;
hypno ++;
}
}


@@ -0,0 +1,163 @@
#!/usr/local/bin/gawk -f
#
# combine-rover-controls --
# combine several rover control files for system combination
# (may be used recursively)
#
# $Header: /home/srilm/CVS/srilm/utils/src/combine-rover-controls.gawk,v 1.7 2017/08/16 06:34:16 stolcke Exp $
#
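# A hypothetical rover control file (fields as handled below: nbest directory,
# LM weight, word transition weight, system weight, nbest depth limit, and
# posterior scale, with defaults filled in for missing fields):
#
#	sys1/nbest 8 0 1
#	sys2/nbest 9 0.5 2 100
#
# A line whose third field is "+" adds an extra score directory (with its
# weight) to the system defined by the following line.
#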
function process_rover_control(file, weight, pscale) {
dir = file;
sub("/[^/]*$", "", dir);
if (file == dir) {
dir = "";
}
while ((status = (getline < file)) > 0) {
if (NF == 0) continue;
# skip comment line
if (/^##/) continue;
if (!keep_paths) {
# deal with relative directories in rover-control file:
# prepend rover-control directory path
if ($1 !~ /^\// && dir != "") {
$1 = dir "/" $1;
}
}
if ($3 == "+") {
system_id = system_id $1 " " $2 " +\n";
} else {
nsystems += 1;
# handle missing lmw and wtw and system weights
if ($2 == "") $2 = 8;
if ($3 == "") $3 = 0;
if ($4 == "") $4 = 1;
# missing nbest depth limit
if ($5 == "") nbest_depth[nsystems] = 0;
else nbest_depth[nsystems] = $5;
# override posterior scale if specified
if (pscale) system_pscale[nsystems] = pscale;
else system_pscale[nsystems] = $6
system_id = system_id $1 " " $2 " " $3;
# see if this system has appeared before
if (system_id in system_index) {
# merge system weights
# ensuring weight tying spec is compatible
if ($4 == "=") {
if (system_weight[system_index[system_id]] != "=") {
print "cannot combine weight tying" > "/dev/stderr";
exit(1);
}
} else {
if (system_weight[system_index[system_id]] == "=") {
print "cannot combine weight tying" > "/dev/stderr";
exit(1);
}
system_weight[system_index[system_id]] += $4 * weight;
}
# skip the duplicate system
nsystems -= 1;
} else {
# divide system weight by total number of input files
# but preserve weight tying info
if ($4 == "=") {
system_weight[nsystems] = $4;
} else {
system_weight[nsystems] = $4 * weight;
}
system_dirs_weights[nsystems] = system_id;
system_index[system_id] = nsystems;
}
system_id = "";
}
}
if (status < 0) {
print file ": " ERRNO > "/dev/stderr";
exit(1);
}
close(file);
return;
}
BEGIN {
arg_offset = 0;
ninputs = ARGC - 1;
nsystems = 0;
while (1) {
if (ARGV[arg_offset+1] ~ /^lambda=/) {
lambda = substr(ARGV[arg_offset+1], length("lambda")+2);
ninputs -= 1;
arg_offset += 1;
} else if (ARGV[arg_offset+1] ~ /^postscale=/) {
postscale = substr(ARGV[arg_offset+1], length("postscale")+2);
ninputs -= 1;
arg_offset += 1;
} else if (ARGV[arg_offset+1] ~ /^norm=/) {
norm_weights = substr(ARGV[arg_offset+1], length("norm")+2);
ninputs -= 1;
arg_offset += 1;
} else if (ARGV[arg_offset+1] ~ /^keeppaths=/) {
keep_paths = substr(ARGV[arg_offset+1], length("keeppaths")+2);
ninputs -= 1;
arg_offset += 1;
} else {
break;
}
}
if (ninputs < 1) {
print "usage: " ARGV[0] " [lambda=WEIGHTS] [postscale=S] ROVER-CTRL1 ROVER-CTRL2 ..." \
>> "/dev/stderr";
exit(2);
}
# initialize priors from lambdas
nlambdas = split(lambda, lambdas);
lambda_sum = 0.0;
for (i = 1; i <= nlambdas; i ++) {
lambda_sum += lambdas[i];
}
# fill in the missing lambdas with uniform values
for (i = nlambdas + 1; i <= ninputs; i ++) {
lambdas[i] = (1 - lambda_sum)/(ninputs - nlambdas);
}
for (i = 1; i <= ninputs; i ++) {
process_rover_control(ARGV[arg_offset + i], lambdas[i], postscale);
}
if (norm_weights) {
weight_sum = 0;
for (i = 1; i <= nsystems; i ++) {
weight_sum += system_weight[i];
}
for (i = 1; i <= nsystems; i ++) {
system_weight[i] /= weight_sum;
}
}
for (i = 1; i <= nsystems; i ++) {
print system_dirs_weights[i], system_weight[i], nbest_depth[i], system_pscale[i];
}
exit(0);
}


@@ -0,0 +1,92 @@
#!/usr/local/bin/gawk -f
#
# compare-ppls --
# Compare two LMs for significant differences in probabilities
# The probabilities calculated for the test set words are ranked
# pairwise, as appropriate for submitting the result to a sign test.
#
# usage: compare-ppls [mindelta=d] pplout1 pplout2
#
# where pplout1, pplout2 is the output of ngram -debug 2 -ppl for the two
# models. d is the minimum difference of logprobs for two probs to
# be considered different.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compare-ppls.gawk,v 1.6 2014-07-03 05:57:09 stolcke Exp $
#
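# A hypothetical end-to-end example (file names are placeholders):
#
#	ngram -lm lm1 -debug 2 -ppl test.txt > ppl.1
#	ngram -lm lm2 -debug 2 -ppl test.txt > ppl.2
#	compare-ppls mindelta=0.001 ppl.1 ppl.2
#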
function abs(x) {
return (x < 0) ? -x : x;
}
BEGIN {
sampleA_no = 0;
sampleB_no = 0;
mindelta = 0;
verbose = 0;
signif = 0;
diff_sum = 0;
diff_squared_sum = 0;
logINF = -100000;
}
FNR == 1 {
if (!readingA) {
readingA = 1;
} else {
readingA = 0;
}
}
readingA && $1 == "p(" {
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) prob = logINF;
else prob = $10;
sampleA[sampleA_no ++] = prob;
}
!readingA && $1 == "p(" {
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) prob = logINF;
else prob = $10;
if (sampleB_no > sampleA_no) {
printf "sample B contains more data than sample A" >> "/dev/stderr";
exit(1);
}
diff = sampleA[sampleB_no] - prob;
if (abs(diff) <= mindelta) {
equal ++;
} else {
diff_sum += diff;
diff_squared_sum += diff * diff;
if (diff < 0) {
if (verbose) {
print;
}
greater ++;
}
}
sampleB_no ++;
}
END {
if (sampleB_no < sampleA_no) {
printf "sample B contains less data than sample A" >> "/dev/stderr";
print sampleB_no, sampleA_no;
exit(1);
}
mean_diff = diff_sum / sampleA_no;
mean_sq_error = diff_squared_sum / sampleA_no - mean_diff * mean_diff;
stdev = sqrt(mean_sq_error);
printf "total %d, equal %d, different %d, greater %d\n", \
sampleB_no, equal, sampleB_no - equal, greater;
printf "meandiff %g, mse %g, stdev %g\n", \
mean_diff, mean_sq_error, stdev;
if (signif) {
printf "significance:\n";
less = sampleB_no - equal - greater;
system("cumbin " (less+greater) " " (less>greater ? less : greater));
}
}


@@ -0,0 +1,131 @@
#!/bin/sh
#
# compare-sclite --
# compare sclite word error sentence-by-sentence
#
# $Header: /home/srilm/CVS/srilm/utils/src/compare-sclite,v 1.26 2017/08/12 05:48:34 stolcke Exp $
#
# enforce proper sorting order
LC_COLLATE=C
export LC_COLLATE
if [ $# -lt 3 ]; then
echo "usage: $0 [-v] -h1 hyps1 -h2 hyps2 -r refs [-S id-subset] [-M|-multiwords] [sclite-options ...]" >&2
echo " or $0 hyps1 hyps2 refs" >&2
exit 2
elif [ $# -eq 3 ]; then
# old syntax
hypsA=${1}
hypsB=${2}
refs=${3}
else
# parse arguments
while [ $# -gt 0 ]; do
case "$1" in
-r) refs=$2; shift ;;
-h1) hypsA=$2; shift ;;
-h2) hypsB=$2; shift ;;
-S) options="$options -S $2"; shift ;;
*) options="$options $1" ;;
esac
shift
done
fi
tmpdir=${TMPDIR-/tmp}
pralignA=pralignA$$
pralignB=pralignB$$
subset="$tmpdir/subset$$"
trap '/bin/rm -f $tmpdir/$pralignA.pra $tmpdir/$pralignB.pra $subset.*' 0 1 2 13 15
set -e
#
# use the intersection of the two hyp sets and (if specified) the -S set
#
case "$hypsA" in
*.ctm) case "$hypsB" in
*.ctm) ${GAWK-gawk} '{ print $1 "_" $2 }' < "$hypsA" | sort -u > $subset.A
${GAWK-gawk} '{ print $1 "_" $2 }' < "$hypsB" | sort -u > $subset.B
;;
*) echo "both hyps must be in same format" >&2
exit 2
;;
esac
;;
*) case "$hypsB" in
*.ctm) echo "both hyps must be in same format" >&2
exit 2
;;
*) ${GAWK-gawk} '{ print $1 }' < "$hypsA" | sort -u > $subset.A
${GAWK-gawk} '{ print $1 }' < "$hypsB" | sort -u > $subset.B
;;
esac
;;
esac
comm -12 $subset.A $subset.B > $subset.AB
options="$options -S $subset.AB"
#
# generate alignments for the two hyp sets
#
compute-sclite -h "$hypsA" -r "$refs" $options -O $tmpdir -n $pralignA -o pralign
compute-sclite -h "$hypsB" -r "$refs" $options -O $tmpdir -n $pralignB -o pralign
#
# compute error totals by utterance and compare
#
${GAWK-gawk} '
BEGIN {
less = greater = equal = 0;
}
$1 == "id:" {
sentid = $2;
sub("^\\(", "", sentid);
sub("\\)$", "", sentid);
next;
}
$1 == "Scores:" {
corr = $6;
subs = $7;
dels = $8;
inss = $9;
words = corr + subs + dels;
errs = subs + dels + inss;
if (errors[sentid] == "") {
errors[sentid] = errs;
total_wordsA += words;
total_errsA += errs
total_sentsA ++;
} else {
if (errs > errors[sentid]) greater++;
else if (errs < errors[sentid]) less++;
else equal++;
total_wordsB += words;
total_errsB += errs;
total_sentsB ++;
}
next;
}
END {
werA = (total_wordsA > 0 ? total_errsA/total_wordsA * 100 : 0);
werB = (total_wordsB > 0 ? total_errsB/total_wordsB * 100 : 0);
printf "result 1: %d errors (%.2f%%), %d words, %d sentences\n", \
total_errsA, werA, total_wordsA, total_sentsA;
printf "result 2: %d errors (%.2f%%), %d words, %d sentences\n", \
total_errsB, werB, total_wordsB, total_sentsB;
printf "less %d, greater %d, equal %d, different %d (%+.2f%%)\n", \
less, greater, equal, less + greater, werB - werA;
if (less + greater > 0) {
printf "significance:\n"
system("cumbin " (less+greater) " " (less>greater ? less : greater));
}
}
' $tmpdir/$pralignA.pra $tmpdir/$pralignB.pra


@@ -0,0 +1,181 @@
#!/usr/local/bin/gawk -f
#
# compute-best-mix --
# Compute the best mixture weight (-lambda) for interpolating N
# LMs.
#
# usage: compute-best-mix [lambda="l1 l2 ..."] [precision=p] pplout1 pplout2 ...
#
# where pplout1, pplout2, ... is the output of ngram -debug 2 -ppl for the
# models. li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-mix.gawk,v 1.13 2017/12/22 01:34:49 stolcke Exp $
#
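# A hypothetical example for three component LMs, starting the search at
# weights (0.6, 0.2, 0.2) (file names are placeholders):
#
#	compute-best-mix lambda="0.6 0.2" precision=0.0001 ppl.a ppl.b ppl.c
#
# The lambda for the last model is filled in so that the weights sum to 1.
#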
BEGIN {
verbose = 0;
lambda = "0.5";
precision = 0.001;
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
unk = "<unk>";
}
function abs(x) {
return (x < 0) ? -x : x;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
function print_vector(x, n) {
result = "(" x[1];
for (k = 2; k <= n; k++) {
result = result " " x[k];
}
return result ")"
}
function print_vector_pairwise(x, n) {
total_lambda = x[1];
result = "(" 1;
for (k = 2; k <= n; k++) {
total_lambda += x[k];
result = result " " x[k]/total_lambda;
}
return result ")"
}
FNR == 1 {
nfiles ++;
}
$1 == "p(" {
word = $2;
# Canonicalize input to have at most one representative context word;
sub("[|] [^)]*)", "| X )");
$0 = $0;
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) {
prob = logINF;
} else {
prob = $10;
}
# If a count is given.
if ($11 ~ /^[*]/) {
count = substr($11,2);
} else {
count = 1;
}
sample_no = ++ nsamples[nfiles];
samples[nfiles " " sample_no] = prob;
counts[sample_no] = count;
if (sample_no in words) {
if (word != words[sample_no] && word != unk && words[sample_no] != unk) {
print "warning: word mismatch in file " FILENAME ", token " sample_no \
": " word " != " words[sample_no] > "/dev/stderr";
}
} else {
words[sample_no] = word;
}
}
END {
for (i = 2; i <= nfiles; i ++) {
if (nsamples[i] != nsamples[1]) {
printf "mismatch in number of samples (%d != %d)", \
nsamples[1], nsamples[i] >> "/dev/stderr";
exit(1);
}
}
last_prior = 0.0;
# initialize priors from lambdas
nlambdas = split(lambda, lambdas);
lambda_sum = 0.0;
for (i = 1; i <= nlambdas; i ++) {
priors[i] = lambdas[i];
lambda_sum += lambdas[i];
}
# fill in the missing lambdas
for (i = nlambdas + 1; i <= nfiles; i ++) {
priors[i] = (1 - lambda_sum)/(nfiles - nlambdas);
}
iter = 0;
have_converged = 0;
while (!have_converged) {
iter ++;
num_oovs = num_words = 0;
delete post_totals;
log_like = 0;
for (j = 1; j <= nsamples[1]; j ++) {
all_inf = 1;
for (i = 1; i <= nfiles; i ++) {
sample = samples[i " " j];
logpost[i] = log10(priors[i]) + sample;
all_inf = all_inf && (sample == logINF);
if (i == 1) {
logsum = logpost[i];
} else {
logsum = addlogs(logsum, logpost[i]);
}
}
# skip OOV words
if (all_inf) {
num_oovs += counts[j];
continue;
}
num_words += counts[j];
log_like += logsum * counts[j];
for (i = 1; i <= nfiles; i ++) {
post_totals[i] += exp10(logpost[i] - logsum) * counts[j];
}
}
printf "iteration %d, lambda = %s, ppl = %g\n", \
iter, print_vector(priors, nfiles), \
exp10(-log_like/num_words) >> "/dev/stderr";
fflush();
have_converged = 1;
for (i = 1; i <= nfiles; i ++) {
last_prior = priors[i];
priors[i] = post_totals[i]/num_words;
if (abs(last_prior - priors[i]) > precision) {
have_converged = 0;
}
}
}
printf "%d non-oov words, best lambda %s\n",
num_words, print_vector(priors, nfiles);
printf "pairwise cumulative lambda %s\n",
print_vector_pairwise(priors, nfiles);
}


@@ -0,0 +1,166 @@
#!/usr/local/bin/gawk -f
#
# compute-best-rover-mix --
# Compute the best mixture weight for combining multiple sausages
#
# usage: compute-best-rover-mix [lambda="l1 l2 ..."] [addone=N] [precision=p] nbest-rover-ref-posteriors-output
#
# where the input is the output of nbest-rover -write-ref-posteriors .
# li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-rover-mix.gawk,v 1.6 2016-12-10 07:06:41 stolcke Exp $
#
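# A hypothetical example that ties the weights of the first two systems into
# one bin and writes the optimized weights to a file (names are placeholders):
#
#	compute-best-rover-mix tying="1 1" write_weights=rover.wts ref-posteriors.out
#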
BEGIN {
verbose = 0;
lambda = "0.5";
addone = 0;
precision = 0.001;
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
zero_probs = 0;
}
function abs(x) {
return (x < 0) ? -x : x;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
function print_vector(x, n) {
result = x[1];
for (k = 2; k <= n; k++) {
result = result " " x[k];
}
return result;
}
{
nsystems = NF - 4;
if ($4 == 0) {
zero_probs ++;
} else {
sample_no ++;
for (i = 1; i <= nsystems; i++) {
samples[i " " sample_no] = $(i + 4);
}
}
}
END {
last_prior = 0.0;
# initialize priors from lambdas
nlambdas = split(lambda, lambdas);
lambda_sum = 0.0;
for (i = 1; i <= nlambdas; i ++) {
priors[i] = lambdas[i];
lambda_sum += lambdas[i];
}
# fill in the missing lambdas
for (i = nlambdas + 1; i <= nsystems; i ++) {
priors[i] = (1 - lambda_sum)/(nsystems - nlambdas);
}
# set up weight tying - assign input systems (weights) to tying bins
if (tying) {
ntying = split(tying, tying_bins);
for (i = 1; i <= ntying && i <= nsystems; i ++) {
this_bin = int(tying_bins[i]);
if (this_bin <= 0) {
print "invalid tying bin: " tying_bins[i];
exit(1);
}
binfor[i] = this_bin;
weights_in_bin[this_bin] += 1;
if (this_bin > nbins) nbins = this_bin;
}
} else {
i = 1;
nbins = 0;
}
# assign unique bins for weights not covered in tying argument string
for ( ; i <= nsystems; i ++) {
binfor[i] = ++nbins;
weights_in_bin[nbins] = 1;
}
iter = 0;
have_converged = 0;
while (!have_converged) {
iter ++;
num_words = 0;
delete post_totals;
log_like = 0;
for (j = 1; j <= sample_no; j ++) {
all_inf = 1;
for (i = 1; i <= nsystems; i ++) {
sample = log10(samples[i " " j]);
logpost[i] = log10(priors[i]) + sample;
all_inf = all_inf && (sample == logINF);
if (i == 1) {
logsum = logpost[i];
} else {
logsum = addlogs(logsum, logpost[i]);
}
}
# skip OOV words
if (all_inf) {
continue;
}
num_words ++;
log_like += logsum;
# total up the posteriors for each weight bin
for (i = 1; i <= nsystems; i ++) {
post_totals[binfor[i]] += exp10(logpost[i] - logsum);
}
}
printf "iteration %d, lambda = %s, ppl = %g\n", \
iter, print_vector(priors, nsystems), \
exp10(-log_like/num_words) >> "/dev/stderr";
fflush();
have_converged = 1;
for (i = 1; i <= nsystems; i ++) {
last_prior = priors[i];
priors[i] = (post_totals[binfor[i]]/weights_in_bin[binfor[i]] + addone)/(num_words + nsystems * addone);
if (abs(last_prior - priors[i]) > precision) {
have_converged = 0;
}
}
}
weights = print_vector(priors, nsystems);
printf "%d alignment positions, best lambda (%s)\n", num_words, weights;
if (write_weights) {
print weights > write_weights;
}
}


@@ -0,0 +1,159 @@
#!/usr/local/bin/gawk -f
#
# compute-best-sentence-mix --
# Compute the best sentence-level mixture weight for interpolating N
# LMs.
#
# usage: compute-best-sentence-mix [lambda="l1 l2 ..."] [addone=N] [precision=p] pplout1 pplout2 ...
#
# where pplout1, pplout2, ... is the output of ngram -debug 1 -ppl for the
# models. li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-sentence-mix.gawk,v 1.4 2016/06/01 20:20:38 stolcke Exp $
#
BEGIN {
verbose = 0;
lambda = "0.5";
addone = 0;
precision = 0.001;
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
}
function abs(x) {
return (x < 0) ? -x : x;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
function print_vector(x, n) {
result = "(" x[1];
for (k = 2; k <= n; k++) {
result = result " " x[k];
}
return result ")"
}
FNR == 1 {
nfiles ++;
num_words = 0;
num_sentences = 0;
}
# 1 sentences, 6 words, 0 OOVs
/^1 sentences, [0-9]* words, [0-9]* OOVs/ {
# exclude OOVs
num_words += $3 - $5;
expect_logprob = 1;
}
# 0 zeroprobs, logprob= -22.9257 ppl= 1884.06 ppl1= 6621.32
/^[0-9]* zeroprobs, logprob= / && expect_logprob {
# exclude zero prob words
num_words -= $1;
num_sentences += 1;
if ($4 ~ /-[Ii]nf|-1\.#INF/) {
prob = logINF;
} else {
prob = $4;
}
sample_no = ++ nsamples[nfiles];
samples[nfiles " " sample_no] = prob;
expect_logprob = 0;
}
END {
for (i = 2; i <= nfiles; i ++) {
if (nsamples[i] != nsamples[1]) {
printf "mismatch in number of samples (%d != %d)", \
nsamples[1], nsamples[i] >> "/dev/stderr";
exit(1);
}
}
last_prior = 0.0;
# initialize priors from lambdas
nlambdas = split(lambda, lambdas);
lambda_sum = 0.0;
for (i = 1; i <= nlambdas; i ++) {
priors[i] = lambdas[i];
lambda_sum += lambdas[i];
}
# fill in the missing lambdas
for (i = nlambdas + 1; i <= nfiles; i ++) {
priors[i] = (1 - lambda_sum)/(nfiles - nlambdas);
}
iter = 0;
have_converged = 0;
while (!have_converged) {
iter ++;
delete post_totals;
log_like = 0;
for (j = 1; j <= nsamples[1]; j ++) {
all_inf = 1;
for (i = 1; i <= nfiles; i ++) {
sample = samples[i " " j];
logpost[i] = log10(priors[i]) + sample;
all_inf = all_inf && (sample == logINF);
if (i == 1) {
logsum = logpost[i];
} else {
logsum = addlogs(logsum, logpost[i]);
}
}
# skip OOV words
if (all_inf) {
continue;
}
log_like += logsum;
for (i = 1; i <= nfiles; i ++) {
post_totals[i] += exp10(logpost[i] - logsum);
}
}
printf "iteration %d, lambda = %s, ppl = %g\n", \
iter, print_vector(priors, nfiles), \
exp10(-log_like/(num_words + num_sentences)) \
>> "/dev/stderr";
fflush();
have_converged = 1;
for (i = 1; i <= nfiles; i ++) {
last_prior = priors[i];
priors[i] = (post_totals[i] + addone)/(num_sentences + nfiles * addone);
if (abs(last_prior - priors[i]) > precision) {
have_converged = 0;
}
}
}
printf "%d sentences, %d non-oov words, best lambda %s\n",
num_sentences, num_words, print_vector(priors, nfiles);
}


@@ -0,0 +1,81 @@
#!/usr/local/bin/gawk -f
#
# compute-oov-rate --
# Compute OOV word rate from a vocabulary and a unigram count file
#
# usage: compute-oov-rate vocab countfile ...
#
# Assumes unigram counts do not have repeated words.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-oov-rate.gawk,v 1.10 2018/01/24 03:35:38 stolcke Exp $
#
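# A hypothetical example using unigram counts produced by ngram-count
# (file names are placeholders):
#
#	ngram-count -order 1 -text corpus.txt -write counts.1grams
#	compute-oov-rate vocab.txt counts.1grams
#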
BEGIN {
# high bit characters also detect multibyte characters
letter = "[[:alpha:]\x80-\xFF]";
if ("x" !~ letter) letter = "[A-Za-z\x80-\xFF]";
}
# Read vocab
#
ARGIND == 1 {
vocab[$1] = 1;
}
function is_fragment(word) {
return word ~ (letter "-$") || word ~ ("^-" letter);
}
#
# Read counts
#
ARGIND > 1 {
if ($1 == "<s>" || $1 == "</s>" || $1 == "-pau-") {
next;
}
total_count += $2;
total_types ++;
if (!vocab[$1]) {
oov_count += $2;
oov_types ++;
if (debug) {
print "OOV: " $1, $2 > "/dev/stderr";
}
if (!is_fragment($1)) {
if (write_oov_words) {
print > write_oov_words;
}
} else {
if (write_oov_frags) {
print > write_oov_frags;
}
}
}
if (!is_fragment($1)) {
total_nofrag_count += $2;
total_nofrag_types ++;
if (!vocab[$1]) {
oov_nofrag_count += $2;
oov_nofrag_types ++;
}
}
}
END {
printf "OOV tokens: %d / %d (%.2f%%) ", \
oov_count, total_count, total_count == 0 ? 0 : 100 * oov_count/total_count;
printf "excluding fragments: %d / %d (%.2f%%)\n", \
oov_nofrag_count, total_nofrag_count, \
total_nofrag_count == 0 ? 0 : 100 * oov_nofrag_count/total_nofrag_count;
printf "OOV types: %d / %d (%.2f%%) ", \
oov_types, total_types, total_types == 0 ? 0 : 100 * oov_types/total_types;
printf "excluding fragments: %d / %d (%.2f%%)\n", \
oov_nofrag_types, total_nofrag_types, \
total_nofrag_types == 0 ? 0 : 100 * oov_nofrag_types/total_nofrag_types;
}


@@ -0,0 +1,252 @@
#!/bin/sh
#
# compute-sclite --
# compute word error rate from a sentid hyp file and a sentid reference
# file, using the NIST 'sclite' program
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-sclite,v 1.49 2016/09/23 20:05:51 stolcke Exp $
#
# enforce proper sorting order
LC_COLLATE=C
export LC_COLLATE
reject="@reject@"
sclite=sclite
subsets=
remove_periods=
format_sentids=1
if [ $# -lt 2 ]; then
echo "usage: $0 [-v] -h hyps -r refs [-S id-subset] [-M|-multiwords] [-noperiods] [-g glm-file] [sclite-options ...]" >&2
echo " or $0 hyps refs" >&2
exit 2
elif [ $# -eq 2 ]; then
# old syntax
hyps=${1}
refs=${2}
else
# parse arguments
while [ $# -gt 0 ]; do
case "$1" in
-v) verbose=1 ;;
-r) refs=$2; shift ;;
-h) hyps="$hyps $2"
name=`basename $2`
shift ;;
-S) subsets="$subsets $2"; shift ;;
-M|-multiwords)
multiwords=1 ;;
-noperiods)
remove_periods=1 ;;
-H) remove_hesitations=1 ;;
-keep_bracketed)
keep_bracketed=1 ;;
-R) reject="<>" ;;
-g) glmfile=$2; shift ;;
-s) case_sensitive=1 ;;
-overlap-limit)
options="$options $1 $2"
sclite=asclite
shift;;
-raw-sentids)
format_sentids=0
;;
*) options="$options $1" ;;
esac
shift
done
fi
if [ -n "$case_sensitive" ]; then
filter_options="-s";
options="$options -s";
fi
tmpdir=${TMPDIR-/tmp}
sentids="$tmpdir/ce.sentids$$"
speakers="$tmpdir/ce.speakers$$"
sortedrefs="$tmpdir/ce.refs$$"
sortedhyps="$tmpdir/ce.hyps$$"
ignorehyps="$tmpdir/ce.ign$$"
if [ -z "$verbose" ]; then
trap '/bin/rm -f $sentids $speakers $sortedrefs $sortedhyps $ignorehyps' \
0 1 2 13 15
fi
set -e
multijoin () {
if [ $# -eq 1 ]; then
cat $1
else
join $1 $2 | { shift; shift; multijoin - "$@"; }
fi
}
#
# extract and sort sentids from hyps
# (for CTM hyps these are just waveform/channel labels)
#
case "$hyps" in
*.ctm)
cat $hyps | \
${GAWK-gawk} '!/^;;/ && $7 != "non-lex" && $7 != "fp" { print $1 "_" $2 }' ;;
*) cat $hyps | ${GAWK-gawk} '{ print $1 }' ;;
esac | \
sort | \
multijoin - $subsets > $sentids
#
# extract list of "speakers" (waveform/channel labels)
#
case "$hyps" in
*.ctm)
cat $sentids | uniq | tr '[A-Z]' '[a-z]' | sort > $speakers
;;
*) sed 's,\([-_][ABab12]\)[-_].*,\1,' $sentids | uniq | \
tr '[A-Z]' '[a-z]' | sort > $speakers
;;
esac
#
# extract and sort refs for these sentids
#
case "$refs" in
*.stm) # NIST scoring:
# filter out speakers not occurring in hyp file
${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' $refs | \
sort -k 1,1 -k 5,5n | \
join - $speakers | \
${GAWK-gawk} '{ $1 = ""; if ($7 ~ /^<.*>$/) $7 = "<>"; print }' | \
if [ -n "$glmfile" ]; then
${GAWK-gawk} '{ gsub("-","_",$1); gsub("-","_",$3); print }' | \
csrfilt.sh $filter_options -i stm -t ref -dh $glmfile
else
cat
fi > $sortedrefs
;;
*.stm.filt) # NIST scoring with pre-filtered references
# filter out speakers not occurring in hyp file
${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' $refs | \
sort -k 1,1 -k 5,5n | \
join - $speakers | \
${GAWK-gawk} '{ $1 = ""; if ($7 ~ /^<.*>$/) $7 = "<>"; print }' | \
if [ -n "$glmfile" ]; then
${GAWK-gawk} '{ gsub("-","_",$1); gsub("-","_",$3); print }'
else
cat
fi > $sortedrefs
;;
*) sort "$refs" | join - $sentids | \
${GAWK-gawk} '{ if (multiwords) for (i = 2; i <= NF; i++) \
gsub("_", " ", $i); print }'\
multiwords=$multiwords | \
sed -e 's,\[[^]]*\],,g' | \
sentid-to-sclite format_sentids=$format_sentids | \
if [ -n "$glmfile" ]; then
csrfilt.sh $filter_options -i trn -t hyp -dh $glmfile
else
cat
fi > $sortedrefs
# find segments to ignore
${GAWK-gawk} 'NF == 2 && tolower($2) == "ignore_time_segment_in_scoring" \
{ print $1 }' < $refs | \
sort > $ignorehyps
;;
esac
if [ ! -s $sortedrefs ]; then
echo "Filtered references are empty" >&2
exit 1
fi
#
# sort and condition hyps
#
case "$refs" in
*.stm|*.stm.filt) # NIST scoring
# sclite will handle ignored segments
case "$hyps" in
*.ctm)
cat $hyps | ${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' | \
sort -b -k 1,1 -k 2,2 -k 3,3 -k 4,4n | join - $speakers | \
${GAWK-gawk} '{ $1 = ""; print }' ;;
*) sort -k 1,1 $hyps | join - $sentids | sentid-to-ctm ;;
esac | \
${GAWK-gawk} '{ # handle new-style CTM format (convert it to old format)
if (NF >= 7) {
if ($7 != "lex") next;
else $7 = $8 = "";
}
if (remove_periods) gsub("[.]", "", $5);
print;
}' remove_periods=$remove_periods | \
if [ -n "$glmfile" ]; then
${GAWK-gawk} '{ gsub("-","_",$1); print }' | \
csrfilt.sh $filter_options -i ctm -t hyp -dh $glmfile | \
if [ -n "$remove_hesitations" ]; then
grep -vi '%HESITATION'
else
cat
fi
else
cat
fi > $sortedhyps
;;
*) # we have to remove ignored segments ourselves
sort -k 1,1 $hyps | join - $sentids | join -v 1 - $ignorehyps | \
${GAWK-gawk} '{ if (multiwords) for (i = 2; i <= NF; i++) gsub("_", " ", $i);
if (remove_periods) for (i = 2; i <= NF; i++) gsub("[.]", "", $i);
print }'\
remove_periods=$remove_periods multiwords=$multiwords | \
sed -e 's,\[[^]]*\],,g' \
-e "s,$reject,,g" \
-e 's,-pau-,,g' | \
if [ -z "$keep_bracketed" ]; then
sed -e 's,<[^>]*>,,g'
else
cat
fi |\
sentid-to-sclite format_sentids=$format_sentids |\
if [ -n "$glmfile" ]; then
csrfilt.sh $filter_options -i trn -t hyp -dh $glmfile | \
if [ -n "$remove_hesitations" ]; then
sed -e 's/\%HESITATION//g' -e 's/\%hesitation//g'
else
cat
fi
else
cat
fi > $sortedhyps
;;
esac
if [ ! -s $sortedhyps ]; then
echo "Filtered hypotheses are empty" >&2
exit 1
fi
[ "$verbose" ] && set -x
case $sclite in
sclite) options="-n $name $options" ;;
esac
case "$refs" in
*.stm|*.stm.filt) # NIST scoring
$sclite -f 0 -O . \
-h $sortedhyps ctm $name -r $sortedrefs stm \
-D $options
;;
*) $sclite -f 0 -O . \
-h $sortedhyps trn $name -r $sortedrefs trn \
-i swb $options
;;
esac


@@ -0,0 +1,153 @@
#!/bin/sh
#
# compute-sclite-nbest --
# Compute errors for nbest hypotheses using sclite
# for use with nbest-optimize -errors option
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-sclite-nbest,v 1.5 2016/09/23 20:05:51 stolcke Exp $
#
usage () {
echo "$0 nbest-files output-dir -r refs [-filter script] [sclite-options]"
}
if [ $# -lt 2 ]; then
usage;
exit 2
fi
filter=cat
nbest_files=$1
output_dir=$2
shift; shift
while [ $# -gt 0 ]
do
case "$1" in
-r) refs=$2
shift; shift
;;
-filter) filter="$2"
shift; shift
;;
*) sclite_options="$sclite_options $1"
shift
;;
esac
done
if [ -z "$refs" ]; then
usage
exit 2
fi
TMPDIR=${TMPDIR-/tmp}
sortedrefs=$TMPDIR/sortedrefs.$$
nbestrefs=$TMPDIR/nbestrefs.$$
nbesthyps=$TMPDIR/nbesthyps.$$
scliteout=$TMPDIR/scliteout.$$
trap "/bin/rm -f $sortedrefs $nbestrefs $nbesthyps $scliteout; exit 1" 1 2 13 15
set -e
sort -k 1,1 $refs > $sortedrefs
> $nbestrefs
> $nbesthyps
#
# Prepare hyp and reference files
#
cat $nbest_files | \
sed 's,.*/\(.*\).gz$,\1 &,' | \
sort -k 1,1 | \
join - $sortedrefs | \
while read sentid nbestlist refwords
do
if [ -z "$refwords" ]; then
echo "warning: $sentid has no reference" >&2
continue
fi
echo $sentid >&2
gunzip -cf $nbestlist | \
nbest-words | \
$filter | \
${GAWK-gawk} \
-v nbestrefs=$nbestrefs -v nbesthyps=$nbesthyps \
-v outdir=$output_dir \
-v sentid=$sentid -v refwords="$refwords" '{
if (refwords == "ignore_time_segment_in_scoring") {
# this utterance is to be ignored --
# we generate dummy error information directly
# nbest-optimize(1) error count format is: wcr wer nsub ndel nins nerr nw
print 0, 0, 0, 0, 0, 0, 0 | "gzip > " outdir "/" sentid ".gz";
} else {
gsub("<[^ ]*>", "");
gsub("-pau-", "");
hypid = sprintf("%s#%05d", sentid, NR);
print hypid, refwords >> nbestrefs;
print hypid, $0 >> nbesthyps;
}
}'
done
if [ -s $nbestrefs ]; then
#
# Run the scoring
#
(set -x; compute-sclite \
-raw-sentids \
$sclite_options \
-O $TMPDIR -l 1000 \
-r $nbestrefs \
-h $nbesthyps \
-o pralign )
#
# Extract error counts from sclite pra output
#
${GAWK-gawk} -v outdir=$output_dir '
$1 == "id:" {
sentid = $2;
sub("^\\(", "", sentid);
# strip the hyp number
sub("#[0-9]*)$", "", sentid);
# sclite lowercases sentids
# Heuristically restore channel letters to uppercase
sub("_a_", "_A_", sentid);
sub("_b_", "_B_", sentid);
sub("-a-", "-A-", sentid);
sub("-b-", "-B-", sentid);
if (sentid != last_sentid) {
if (outfile) close(outfile);
outfile = "gzip > " outdir "/" sentid ".gz"
last_sentid = sentid;
}
next;
}
$1 == "Scores:" {
corr = $6;
subs = $7;
dels = $8;
inss = $9;
words = corr + subs + dels;
errs = subs + dels + inss;
wer = words > 0 ? errs/words : 0;
# nbest-optimize(1) error count format is: wcr wer nsub ndel nins nerr nw
print words-dels-subs, wer, subs, dels, inss, errs, words | outfile;
}
END {
if (outfile) close(outfile);
}' $nbesthyps.pra
fi
/bin/rm -f $sortedrefs $nbestrefs $nbesthyps $nbesthyps.pra


@@ -0,0 +1,152 @@
#!/usr/local/bin/gawk -f
#
# concat-sausages --
# concatenate a list of sausages into a single word confusion network
#
# $Header: /home/srilm/CVS/srilm/utils/src/concat-sausages.gawk,v 1.1 2019/02/09 07:34:35 stolcke Exp $
#
# input format:
#
# name Speech012_apple-iphone-6s-agc_00001330_00010030
# numaligns 32
# posterior 1
# align 0 <s> 1
# info 0 <s> 1.33 0.06 0 0 : :
# align 1 OK 1
# info 1 OK 1.39 0.5 0 0 : :
# align 2 *DELETE* 1 I 3.110077054250103e-33 we 3.193624897980025e-52 i 7.615703946522299e-53
# info 2 I 1.83 0.06 0 0 : :
# info 2 we 1.85 0.06 0 0 : :
# info 2 i 1.83 0.06 0 0 : :
#
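# A hypothetical invocation (variables must be passed with -v so they are not
# mistaken for input files; names are placeholders):
#
#	concat-sausages -v output_name=combined part1.sausage.gz part2.sausage.gz \
#		> combined.sausage
#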
BEGIN {
name = "";
numaligns = 0;
posterior = 0;
if (posterior_factor == "") {
posterior_factor = 1;
}
sent_start = "<s>";
sent_end = "</s>";
epsilon = 1e-05;
}
function abs(x) {
return x < 0 ? -x : x;
}
function process_sausage(file, remove_start, remove_end) {
if (file ~ /.*\.gz$|.*\.Z/) {
input = "exec gunzip -c " file;
} else {
input = "exec cat " file;
}
while ((status = (input | getline)) > 0) {
if ($1 == "name") {
if (output_name != "") {
name = output_name;
} else if (name == "") {
name = $2;
} else {
name = name "+" $2
}
} else if ($1 == "posterior") {
if (posterior != 0 && abs($2 - posterior) > epsilon) {
print file ": incompatible posterior: " $2 > "/dev/stderr"
exit(1);
} else {
posterior = $2;
# if (posterior_factor != 1) {
# posterior *= posterior_factor;
# }
}
} else if ($1 == "numaligns") {
# offset for renumbered alignments
start_alignment = numaligns;
} else if ($1 == "align") {
$2 = $2 + start_alignment;
if (posterior_factor != 1 && $3 != sent_start && $3 != sent_end) {
for (i = 4; i <= NF; i += 2) {
$i *= posterior_factor;
}
}
#
# remove alignment positions that are just for
# start/end sentence tags, if so desired
#
if (NF == 4 && $3 == sent_start && remove_start) {
start_alignment --;
;
} else if (NF == 4 && $3 == sent_end && remove_end) {
start_alignment --;
;
} else {
alignments[$2] = $0;
if ($2 + 1 > numaligns) {
numaligns = $2 + 1;
}
}
} else if ($1 == "info") {
$2 = $2 + start_alignment;
if (!($2 in info)) {
info[$2] = $0;
} else {
info[$2] = info[$2] "\n" $0;
}
} else if ($1 == "time") {
; # ignore
} else {
print file ": unknown keyword: " $1 > "/dev/stderr";
exit(1);
}
}
if (status < 0) {
print "error opening " file >> "/dev/stderr";
}
close(input);
}
function output_sausage() {
print "name", name;
print "numaligns", numaligns;
print "posterior", posterior;
for (i = 0; i < numaligns; i ++) {
if (i in alignments) {
print alignments[i];
if (i in info) {
print info[i];
}
}
}
}
BEGIN {
if (ARGC < 2) {
print "usage: " ARGV[0] " SAUSAGE1 SAUSAGE2 ..." \
>> "/dev/stderr";
exit(2);
}
for (arg = 1; arg < ARGC; arg ++) {
process_sausage(ARGV[arg], arg > 1, arg < ARGC-1);
}
output_sausage();
}


@@ -0,0 +1,13 @@
#!/usr/local/bin/gawk -f
#
# context-ngrams --
# Extract counts corresponding to ngram contexts
#
# $Header: /home/srilm/CVS/srilm/utils/src/context-ngrams.gawk,v 1.1 2008/09/30 03:54:05 stolcke Exp $
#
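# For example, the (hypothetical) count line "the quick fox 7" becomes
# "the quick  7": the last word before the count is removed, leaving the
# context and its count.
#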
NF > 2 {
$(NF-1) = "";
print $0;
}


@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# continuous-ngram-count --
# Generate ngram counts ignoring line breaks
#
# usage: continuous-ngram-count order=ORDER textfile | ngram-count -read -
#
# $Header: /home/srilm/CVS/srilm/utils/src/continuous-ngram-count.gawk,v 1.1 1998/08/24 00:52:30 stolcke Exp $
#
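# For example, with order=3 the two input lines "a b" and "c" yield the same
# counts as the single line "a b c", namely
#	a 1 / b 1 / a b 1 / c 1 / b c 1 / a b c 1
# since line breaks are ignored.
#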
BEGIN {
order = 3;
head = 0; # next position in ring buffer
}
function process_word(w) {
buffer[head] = w;
ngram = "";
for (j = 0; j < order; j ++) {
w1 = buffer[(head + order - j) % order];
if (w1 == "") {
break;
}
ngram = w1 " " ngram;
print ngram 1;
}
head = (head + 1) % order;
}
{
for (i = 1; i <= NF; i ++) {
process_word($i);
}
}


@@ -0,0 +1,80 @@
#!/usr/bin/env perl
# This tool calculates probability over the tail of a binomial
# distribution. The calculation is done directly, without using any
# approximations.
#
# This program is in the public domain. It was written
# by Brett Kessler and David Gelbart.
#
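# Example (numbers are approximate): "cumbin 100 60" reports the one-tailed
# probability P(k >= 60 | n=100, p=0.5), which is roughly 0.028, and twice
# that value for the two-tailed result.
#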
use warnings;
use strict;
use POSIX;
if (@ARGV != 2 && @ARGV != 3) {
die "Usage: $0 n k [p]\n";
}
my $n = $ARGV[0];
my $k = $ARGV[1];
my $p = $ARGV[2];
if (!(defined $p)) {
$p = 0.5;
}
if (($n - $k) > $k) {
die "Did you choose the right value of k?\n";
}
my $P = tailBinomial($n, $k, $p);
print "One-tailed: P(k >= ${k} | n=${n}, p=${p}) = ${P}\n";
$P = 2 * $P;
print "Two-tailed: 2*P(k >= ${k} | n=${n}, p=${p}) = ${P}\n";
# Calculate the sum over the tail of the binomial probability distribution.
sub tailBinomial {
my($N, $k, $p) = @_;
my $sum = 0;
for (my $i = $k; $i <= $N; $i++) {
$sum += exp(logBinomial($N, $i, $p));
}
$sum;
}
# We use logarithms during calculation to avoid overflow during the
# calculation of factorials and underflow during the calculation of
# powers of probabilities. This function calculates the log of
# binomial probability for given N, k, p.
sub logBinomial {
my($N, $k, $p) = @_;
my $q = 1 - $p;
# These safety checks were inspired by the code at
# http://faculty.vassar.edu/lowry/binomialX.html
die "Error: N not integer" if ($N != floor($N));
die "Error: k not integer" if ($k != floor($k));
die "Error: k > N" if ($k > $N);
die "Error: p > 1" if ($p > 1);
die "Error: N < 1" if ($N < 1);
logBinomCoeff($N, $k) + $k * log($p) + ($N - $k) * log($q);
}
# Calculate the log of the binomial coefficient for given N and k.
sub logBinomCoeff {
my($N, $k) = @_;
logFactorial($N) - logFactorial($k) - logFactorial($N - $k);
}
# Calculate the log of the factorial of the argument.
sub logFactorial {
my($N) = @_;
my $prod = 0;
for (my $i = 2; $i <= $N; $i++) {
$prod += log($i);
}
$prod;
}

View File

@@ -0,0 +1,89 @@
#!/usr/local/bin/gawk -f
#
# de-vq-lm --
# Expand parameters in a quantized ARPA backoff LM
#
# usage: de-vq-lm bins=CW lm-file > expanded-lm-file
#
# where CW defines the quantization bins.
#
# Copyright (c) 2012 Andreas Stolcke, Microsoft Corp. All Rights Reserved.
#
# $Header: /home/srilm/CVS/srilm/utils/src/de-vq-lm.gawk,v 1.2 2019/09/09 23:13:15 stolcke Exp $
#
BEGIN {
bins = "/dev/null";
}
# read the cw file
#
#VQSize 256
#Codeword Mean Count
# 0 -12.7330028909195 10454
# 1 -12.3314038288506 1494
# etc.
#
NR == 1 {
saveline = $0;
getline < bins;
if ($1 != "VQSize") {
print "file " bins " is not a VQ file" > "/dev/stderr";
exit(1);
}
vqsize = $2;
getline < bins;
if ($1 != "Codeword") {
print "file " bins " is not a VQ file" > "/dev/stderr";
exit(1);
}
while ((getline < bins) > 0) {
vqbin[$1] = $2;
}
close(bins);
$0 = saveline;
}
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
print; next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
print; next;
}
/^\\/ {
print; next;
}
#
# replace VQ index with value in ngram parameter lines
#
currorder {
if (!($1 in vqbin)) {
print "line: " NR ": VQ bin #" $1 "is undefined" > "/dev/stderr";
exit(1);
}
$1 = vqbin[$1];
# backoff weight, if any
if (NF == currorder + 2) {
if (!($NF in vqbin)) {
print "line: " NR ": VQ bin #" $NF "is undefined" > "/dev/stderr";
exit(1);
}
$NF = vqbin[$NF];
}
print; next;
}
# pass through anything else
{ print }

View File

@@ -0,0 +1,79 @@
#!/bin/sh
#
# empty-sentence-lm --
# modify language model to allow the empty sentence.
# This adds a "<s> </s>" bigram to the model and scales the
# probabilities of other bigrams starting with <s>.
#	Backoff weights are recomputed.
#
# usage: empty-sentence-lm -prob P -lm oldlm -write-lm newlm
#
# $Header: /home/srilm/CVS/srilm/utils/src/empty-sentence-lm,v 1.5 2013/03/09 07:13:01 stolcke Exp $
#
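# Worked example, assuming the default -prob 0.1: the new "<s> </s>"
# bigram gets log10(0.1) = -1, existing bigrams starting with <s> have
# log10(0.9) ~= -0.046 added to their log probs, and the final
# "ngram -renorm" pass recomputes the backoff weights exactly.
#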
oldlm=-
newlm=-
prob=0.1
vocab=/dev/null
norm_option=-renorm
while [ $# -gt 0 ]; do
case "$1" in
-prob) prob="$2" ; shift ;;
-lm) oldlm="$2" ; shift ;;
-write-lm) newlm="$2" ; shift ;;
-nonorm) norm_option= ; shift ;;
*) options="$options $1" ;;
esac
shift
done
gzip -dcf $oldlm | ${GAWK-gawk} '
function log10(x) {
return log(x)/2.30258509299404568402;
}
/^ngram 2=/ {
num = substr($2, 3);
print "ngram 2=" num + 1;
next;
}
#
# add empty-sentence bigram
#
/^\\2-grams:/ {
print;
print log10(prob), "<s> </s>";
in_ngrams = 2;
next;
}
#
# ensure that <s> has backoff weight and
# approximately adjust it (correct adjustment done by ngram -renorm)
#
in_ngrams == 1 && $2 == "<s>" {
$3 += log10(1-prob);
}
#
# scale bigram probs starting with <s>
#
in_ngrams == 2 && $2 == "<s>" {
$1 += log10(1-prob);
}
/^\\1-grams:/ {
in_ngrams = 1;
}
/^\\3-grams:/ {
in_ngrams = 3;
}
{
print;
}' prob=$prob | \
ngram -lm - $norm_option -write-lm "$newlm" $options

View File

@@ -0,0 +1,17 @@
#!/usr/local/bin/gawk -f
#
# extract-skip-probs --
# Extract the skip probabilities from a Skip-Ngram model
#
# $Header: /home/srilm/CVS/srilm/utils/src/extract-skip-probs.gawk,v 1.1 1996/05/20 21:22:09 stolcke Exp $
#
NF == 0 {
next;
}
/\\end\\/ {
end_seen = 1;
next;
}
end_seen {
printf "%s %f\n", $1, $2;
}

View File

@@ -0,0 +1,44 @@
#!/usr/local/bin/gawk -f
#
# filter-event-counts --
# Remove from a count file all ngrams that don't correspond to an "event"
# for the LM, such that
#
# ngram -order N -lm LM -ppl TEXT
# and
# ngram-count -order N -text TEXT -write - | \
# filter-event-counts order=N | \
# ngram -order N -lm LM -counts -
#
# yield the same result.
#
# $Header: /home/srilm/CVS/srilm/utils/src/filter-event-counts.gawk,v 1.2 2009/09/25 00:06:50 stolcke Exp $
#
BEGIN {
order = 3;
escape = "";
sent_start = "<s>";
}
# pass escaped lines through
escape != "" && substr($0, 1, length(escape)) == escape {
print;
next;
}
# Start-of-sentence ngrams are always included (except for <s> unigram)
$1 == sent_start {
if (NF == 2) {
next;
} else {
print;
next;
}
}
# ngrams of highest order
NF == order + 1 {
print;
}

View File

@@ -0,0 +1,89 @@
#!/usr/local/bin/gawk -f
#
# find-reference-posteriors --
#	tabulate the sausage posteriors of reference words
#
# usage: find-reference-posteriors posteriors_file=NBEST_POSTERIORS SAUSAGE
#
# $Header: /home/srilm/CVS/srilm/utils/src/find-reference-posteriors.gawk,v 1.4 2010/08/20 00:17:18 stolcke Exp $
#
BEGIN {
sentid = "UNKNOWN";
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
NR == 1 {
if (posteriors_file) {
hypno = 0;
num_sources = 0;
while ((("gzip -dcf " posteriors_file) | getline pline) > 0) {
if (split(pline, a) == 3) {
hyp_source[hypno] = a[1];
if (a[1] > num_sources) {
num_sources = a[1];
}
hyp_posterior[hypno] = a[3];
hypno ++;
}
}
print "read " hypno " posteriors from " num_sources " sources" \
>> "/dev/stderr";
}
}
# input format:
# align 1 hello 0.988212 below 0.00481234 low 0.00331215 ...
# reference 1 hello
# hyps 1 hello 0 1 2 3 4 5 6 7 8 9 10 11 16 17 18 19
$1 == "align" {
position = $2;
delete word_posteriors;
for (i = 3; i <= NF; i +=2 ) {
word_posteriors[$i] = $(i + 1);
}
}
$1 == "reference" && $2 == position {
refword = $3;
}
$1 == "hyps" && $2 == position && $3 == refword {
for (i = 1; i <= num_sources; i ++) {
posterior_sum[i] = logINF;
}
for (i = 4; i <= NF; i ++) {
posterior_sum[hyp_source[$i]] = \
addlogs(posterior_sum[hyp_source[$i]], hyp_posterior[$i]);
}
printf "%s %d %s %g", sentid, position, refword, \
word_posteriors[refword];
for (i = 1; i <= num_sources; i ++) {
printf " %g", exp10(posterior_sum[i]);
}
printf "\n";
}

View File

@@ -0,0 +1,153 @@
#!/usr/local/bin/gawk -f
#
# Post-process CTM files output by lattice-tool -output-ctm to
# use global conversation-relative time marks and channel ids.
# (This requires that the waveform names conform to our standard
# formats, the same as in sentid-to-ctm.)
#
# $Header: /home/srilm/CVS/srilm/utils/src/fix-ctm.gawk,v 1.10 2019/02/09 07:30:11 stolcke Exp $
#
BEGIN {
# time to add to word start times (should be about half FE window size)
phase_shift = 0.01;
tag_pat = "^<.*>$";
htk_tag_pat = "^null|^!sent_start|^!sent_end";
noise_pat = "^\\[.*\\]$";
fragment_pat = "-$";
pause = "-pau-";
channel_letters = 0;
# hesitations (best deleted for NIST scoring;
# should be kept in sync with GLM filter file)
hesitation["uh"] = 1;
hesitation["um"] = 1;
hesitation["eh"] = 1;
hesitation["mm"] = 1;
hesitation["hm"] = 1;
hesitation["ah"] = 1;
hesitation["huh"] = 1;
hesitation["ha"] = 1;
hesitation["er"] = 1;
hesitation["oof"] = 1;
hesitation["hee"] = 1;
hesitation["ach"] = 1;
hesitation["eee"] = 1;
hesitation["ew"] = 1;
parse_sentids = 1;
orig_times = 0; # DON'T preserve original times
sort_cmd = "sort -b -k 1,1 -k 2,2 -k 3,3n";
}
{
sentid = $1;
start_time = $3;
duration = $4;
word = $5;
confidence = $6;
# HTK stuff: strip quotes
sub("\"", "", sentid);
sub("\"", "", sentid);
# archive aliasing info
sub("=.*\\[.*\\]$", "", sentid);
# standard input file suffixes.
sub("\\.plp$", "", sentid);
sub("\\.wav$", "", sentid);
sub("\\.sph$", "", sentid);
if (sentid == last_sentid && start_time == "?") {
start_time = last_end_time;
duration = 0;
}
# exclude sentence start/end tags
if (word ~ tag_pat) next;
if (tolower(word) ~ htk_tag_pat) next;
if (sentid == last_sentid) {
if (start_time <= last_start_time) {
new_start_time = last_start_time + .01;
print "warning: " sentid ": word \"" word "\" start time " start_time " " \
(start_time < last_start_time ? "is less than" : "equals") \
" previous word -- adjusting to " new_start_time > "/dev/stderr";
start_time = new_start_time;
}
}
if (!parse_sentids) {
conv = sentid;
channel = $2;
start_offset = 0;
} else if (match(sentid, "_[0-9]_[-0-9][0-9]*_[0-9][0-9]*$")) {
# waveforms with [012] channel id, timemarks 1/1000s
# NOTE: this form is used by the segmenter
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = sentid_parts[2] / 1000;
end_offset = sentid_parts[3] / 1000;
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*$")) {
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = sentid_parts[2] / 100;
end_offset = sentid_parts[3] / 100;
} else {
print "cannot parse sentid " sentid >> "/dev/stderr";
conv = sentid;
channel = 1;
start_offset = 0;
end_offset = 10000;
}
if (orig_times) {
start_offset = 0;
}
if (channel_letters && channel ~ /^[0-9]/) {
channel = sprintf("%c", 64+channel);
}
speaker_id = conv "_" channel;
ncomps = split(word, word_comps, "_");
for (j = 1; j <= ncomps; j ++) {
this_word = word_comps[j];
if (this_word == pause) {
next;
} else if (this_word in hesitation) {
word_type = "fp";
} else if (this_word ~ fragment_pat) {
word_type = "frag";
} else if (this_word ~ noise_pat) {
word_type = "non-lex";
} else {
word_type = "lex";
}
printf "%s %s %.2f %.2f %s %g %s %s\n", \
conv, channel, \
start_offset + start_time + phase_shift + \
(j - 1) * duration/ncomps,\
duration/ncomps, \
this_word, \
confidence, \
word_type, \
(word_type == "non-lex" ? \
"null" : speaker_id) \
| sort_cmd;
}
last_start_time = start_time;
last_end_time = start_time + duration;
last_sentid = sentid;
}

View File

@@ -0,0 +1,158 @@
#!/usr/local/bin/gawk -f
#
# fsm-to-pfsg --
# convert AT&T FSM acceptor to Decipher PFSG format
#
# usage: fsm-to-pfsg [pfsg_name=NAME] [transducer=1] [scale=S] file.fsm > file.pfsg
# pfsg_name=NAME sets PFSG name to NAME
# transducer=1 indicates input is a transducer
# scale=S sets transition weight scaling factor to S
# (default -1)
#
# $Header: /home/srilm/CVS/srilm/utils/src/fsm-to-pfsg.gawk,v 1.10 2015-07-03 03:45:38 stolcke Exp $
#
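# Input is AT&T FSM text format: arc lines "from to label [cost]" (with an
# extra output label before the cost for transducers) and final-state lines
# "state [cost]".  Hypothetical acceptor input:
#	0 1 hello 0.5
#	1 2 world 1.0
#	2
# Each PFSG node created below corresponds to a (word, FSM-state) pair, so
# that word outputs move from the FSM arcs onto PFSG nodes.
#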
BEGIN {
pfsg_name = "from_fsm";
transducer = 0; # input is transducer
if ("TMPDIR" in ENVIRON) {
tmpdir = ENVIRON["TMPDIR"];
} else {
tmpdir = "/tmp"
}
if ("pid" in PROCINFO) {
pid = PROCINFO["pid"];
} else {
getline pid < "/dev/pid";
}
tmpfile = tmpdir "/fsm.tmp" pid;
# hack to remove tmpfile when killed
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
print "" | trap_cmd;
num_newnodes = 0;
initial_node = -1;
empty_output = "NULL";
epsilon = "<eps>"; # FSM epsilon symbol
map_epsilon = ""; # map epsilon to this symbol
scale = -1; # scaling of transition weights
}
# transition description
NF >= 3 {
from_node = $1;
to_node = $2;
if (map_epsilon && $3 == epsilon) $3 = map_epsilon;
if (transducer) {
if (map_epsilon && $4 == epsilon) $4 = map_epsilon;
# collapse input and output into a single symbol
$3 = $3 ":" $4;
$4 = "";
}
output = $3;
if (initial_node < 0) {
initial_node = from_node;
}
# create new node names for pairs of output,old-node
if (!(output " " to_node in newnode_table)) {
output_table[num_newnodes] = output;
newnode_table[output " " to_node] = num_newnodes ++;
# create list of incoming outputs for each state
insymbols[to_node] = insymbols[to_node] " " output;
}
# save for re-reading
print $0 > tmpfile;
next;
}
# final state description
NF >= 1 {
node = $1;
if (initial_node < 0) {
initial_node = node;
}
# save for re-reading
print $0 > tmpfile;
next;
}
END {
close(tmpfile);
# create initial and final nodes
if (!(empty_output " " initial_node in newnode_table)) {
output_table[num_newnodes] = empty_output;
newnode_table[empty_output " " initial_node] = num_newnodes ++;
insymbols[initial_node] = insymbols[initial_node] " " empty_output;
}
initial_newnode = newnode_table[empty_output " " initial_node];
output_table[num_newnodes] = empty_output;
final_newnode = num_newnodes++;
# print PFSG header info
print "name " pfsg_name;
printf "nodes %d", num_newnodes;
for (i = 0; i < num_newnodes; i ++) {
printf " %s", output_table[i];
}
printf "\n";
printf "initial %d\n", initial_newnode;
printf "final %d\n", final_newnode;
# re-read FSM description, counting total number of new
# transitions
num_transitions = 0;
while (getline < tmpfile) {
from_node = $1;
# duplicate transition for all insymbols of from_node
num_transitions += split(insymbols[from_node], a);
}
close(tmpfile);
printf "transitions %d\n", num_transitions;
	# re-read FSM description, outputting new transitions
while (getline < tmpfile) {
if (NF >= 3) {
from_node = $1;
to_node = $2;
output = $3;
cost = (NF == 3 ? 0 : $4);
# duplicate transition for all insymbols of from_node
n = split(insymbols[from_node], a);
for (i = 1; i <= n; i ++) {
printf "%d %d %d\n", \
newnode_table[a[i] " " from_node], \
newnode_table[output " " to_node], \
scale * cost;
}
} else {
from_node = $1;
cost = (NF == 1 ? 0 : $2);
# add final transition for all insymbols of from_node
n = split(insymbols[from_node], a);
for (i = 1; i <= n; i ++) {
printf "%d %d %d\n", \
newnode_table[a[i] " " from_node], \
final_newnode, \
scale * cost;
}
}
}
}

View File

@@ -0,0 +1,39 @@
#!/usr/local/bin/gawk -f
#
# get-gt-counts --
# generate the counts-of-counts required for Good-Turing discounting
# assumes the ngrams in the input contain no repetitions
#
# usage: get-gt-counts max=<number> out=<name> file ...
#
# $Header: /home/srilm/CVS/srilm/utils/src/get-gt-counts.gawk,v 1.5 2016-01-07 17:19:21 stolcke Exp $
#
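# Hypothetical example: if the input holds 5 distinct bigrams, 3 with
# count 1 and 2 with count 2, then <name>.gt2counts will contain the
# lines "1 3" and "2 2" (plus "0 0" and zero entries up to max) and a
# final "total 5" line.
#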
BEGIN {
max = 10
maxorder = 9;
}
{
total[NF - 1] ++;
}
NF > 1 && $NF <= max {
counts[(NF - 1), $NF] ++;
}
END {
for (order = 1; order <= maxorder; order++) {
if (total[order] > 0) {
if (out) {
outfile = out ".gt" order "counts";
} else {
outfile = "/dev/stdout";
}
for (i = 0; i <= max; i ++) {
c = counts[order, i];
print i, c ? c : "0" > outfile;
}
print "total", total[order] > outfile;
if (out) close(outfile);
}
}
}

View File

@@ -0,0 +1,38 @@
#!/usr/local/bin/gawk -f
#
# get-unigram-probs --
# extract unigram probabilities from backoff LM file
#
# usage: get-unigram-probs bo-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/get-unigram-probs.gawk,v 1.3 2018/06/28 07:45:08 stolcke Exp $
#
BEGIN {
linear = 0;
currorder = 0;
logzero = -99;
}
/^\\[0-9]-grams:/ {
currorder = substr($0,2,1);
next;
}
/^\\/ {
currorder = 0;
next;
}
currorder == 1 && NF > 0 {
if (NF < 2) {
print "line " NR ": missing word" > "/dev/stderr";
} else if (linear) {
print $2, $1 == logzero ? 0 : 10^$1;
} else {
print $2, $1 == logzero ? "-infinity" : $1;
}
next;
}

View File

@@ -0,0 +1,79 @@
#!/usr/local/bin/gawk -f
#
# hits-from-log --
#	Computes n-gram hit ratios from the output of
#
# ngram -debug 2 -ppl
#
# This is useful if one wants to analyse predictability of certain
# words/contexts.
#
# Copyright (c) 1995, SRI International. All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/hits-from-log.gawk,v 1.3 1995/10/28 03:59:31 stolcke Exp $
#
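# The input is expected to contain, per word, a line of the form
# (format approximate)
#	p( streets | the ... ) 	= [3gram] 0.0231 [ -1.6363 ]
# and this script simply tallies how often each [Ngram] (or [N+Tgram])
# tag appears.
#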
BEGIN {
M_LN10 = 2.30258509299404568402; # from <math.h>
}
/6gram/ {
words ++;
hits[6] ++;
next;
}
/5gram/ {
words ++;
hits[5] ++;
next;
}
/4gram/ {
words ++;
hits[4] ++;
next;
}
/3gram/ {
words ++;
hits[3] ++;
next;
}
/3\+Tgram/ {
words ++;
thits[3] ++;
next;
}
/2gram/ {
words ++;
hits[2] ++;
next;
}
/2\+Tgram/ {
words ++;
thits[2] ++;
next;
}
/1gram/ {
words ++;
hits[1] ++;
next;
}
/1\+Tgram/ {
words ++;
thits[1] ++;
next;
}
{
next;
}
END {
printf "%d words, hit rates:\n", words;
for (i = 1; i <= 6; i++) {
if (hits[i]) {
printf "%dgrams: %d (%.1f%%) ", i, hits[i], \
(hits[i]/words * 100);
}
if (thits[i]) {
printf "%d+Tgrams: %d (%.1f%%) ", i, thits[i], \
(thits[i]/words * 100);
}
}
printf "\n";
}

View File

@@ -0,0 +1,50 @@
#!/usr/local/bin/gawk -f
#
# htklat-vocab --
# extract vocabulary used in an HTK lattice
#
# usage: htklat-vocab HTK-LATTICE ... > VOCAB
#
# $Header: /home/srilm/CVS/srilm/utils/src/htklat-vocab.gawk,v 1.3 2004/02/27 21:42:28 stolcke Exp $
#
BEGIN {
null = "!NULL";
quotes = 0;
}
{
for (i = 1; i <= NF; i ++) {
# skip comments
if ($i ~ /^#/) next;
# Note: this doesn't handle quoted spaces
# (as SRILM generally doesn't)
if ($i ~ /^W=/ || $i ~ /^WORD=/) {
word = substr($i, index($i, "=") + 1);
if (quotes) {
# HTK quoting conventions
if (word ~ /^['"]/) {
word = substr(word, 2, length(word)-2);
}
if (word ~ /\\/) {
gsub(/\\\\/, "@QuOtE@", word);
gsub(/\\/, "", word);
gsub(/@QuOtE@/, "\\", word);
}
}
if (word != null) {
is_word[word] = 1;
}
}
}
}
END {
for (word in is_word) {
print word;
}
}

View File

@@ -0,0 +1,14 @@
#!/usr/local/bin/gawk -f
#
# Test for classname heuristic used in add-pauses-to-pfsg.gawk
#
# $Header: /home/srilm/CVS/srilm/utils/src/isclassname.gawk,v 1.1 2007/10/19 04:16:25 stolcke Exp $
#
function is_classname(w) {
return w ~ /^\*.*\*$/ || !(w ~ /[[:lower:]]/ || w ~ /[^\x00-\x7F]/);
}
{
print $1 " is " (!is_classname($1) ? "not " : "") "a class name";
}

View File

@@ -0,0 +1,31 @@
#!/usr/local/bin/gawk -f
#
# log10-to-bytelog --
# convert log-base-10 scores to bytelog
#
# $Header: /home/srilm/CVS/srilm/utils/src/log10-to-bytelog.gawk,v 1.1 1997/04/22 20:20:41 stolcke Exp $
#
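# A bytelog is the natural log scaled by 10000.5/1024, so the conversion
# applied below is
#	bytelog = log10-score * ln(10) * 10000.5 / 1024 ~= 22.49 * log10-score
# rounded to the nearest integer unless round is set to 0.
#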
BEGIN {
logscale = 2.30258509299404568402 * 10000.5 / 1024.0;
scale = 1;
round = 0.5;
}
function rint(x) {
if (x < 0) {
return int(x - round);
} else {
return int(x + round);
}
}
{
for (i = 1; i <= NF; i ++) {
if ($i ~ /^[-+.0-9][.0-9]*$/) {
if (round) {
$i = scale * rint($i * logscale);
} else {
$i = scale * $i * logscale;
}
}
}
print;
}

View File

@@ -0,0 +1,30 @@
#!/usr/local/bin/gawk -f
#
# make-abs-discount --
# computes the absolute (constant) discount values from Good-Turing
# counts-of-counts statistics. (Only the n1 and n2 statistics are used.)
#
# usage: make-abs-discount COUNTFILE
#
# where COUNTFILE was created with get-gt-counts.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-abs-discount.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
#
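# The discount computed below is D = n1 / (n1 + 2*n2); e.g. with
# hypothetical counts-of-counts n1=100 and n2=40 this gives
# D = 100/180 ~= 0.56.
#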
$1 == 1 {
gt1count = $2;
}
$1 == 2 {
gt2count = $2;
}
END {
if (gt1count == 0) {
print "n1 count is zero" >> "/dev/stderr";
exit 1;
}
if (gt2count == 0) {
print "n2 count is zero" >> "/dev/stderr";
exit 1;
}
print gt1count/(gt1count + 2 * gt2count);
}

View File

@@ -0,0 +1,112 @@
#!/bin/sh
#
# make-batch-counts --
# generate n-gram counts in batches
#
# A list of data files is partitioned into batches, results from each of
# which are deposited in a separate ngram-count file.
#
# usage: make-batch-count file-list [batch-size [filter \
# usage: make-batch-counts file-list [batch-size [filter \
#
# file-list is a file containing a list of data files
# (lines starting with # are ignored)
# batch-size is the number of input files per batch
# filter is preprocessor filter to condition the data
# countdir is the directory where count files are deposited
# options are arguments passed on to ngram-count
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-batch-counts,v 1.8 2013/03/19 18:37:52 stolcke Exp $
#
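# Example invocation (hypothetical paths):
#	make-batch-counts corpus.files 20 /bin/cat counts -order 4
# reads the file list corpus.files, counts 20 data files per batch, and
# leaves counts/corpus-<n>.ngrams[.gz] files plus a counts/corpus.stats log.
#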
if [ $# -lt 1 ]; then
echo "usage: $0 file-list [batch-size [filter [countdir [options]]]]" >&2
exit 2
fi
filelist=$1
batchsize=${2-10}
filter=${3-/bin/cat}
countdir=${4-./counts}
case $# in
1) shift;;
2) shift; shift;;
3) shift; shift; shift;;
4) shift; shift; shift; shift;;
esac
options="$@"
what=`basename $filelist .files`
statsfile=$countdir/$what.stats
infiles=$countdir/$what.files
set -e
if [ ! -d $countdir ]; then
mkdir $countdir
fi
trap 'rm -f $newfile $test_in $test_out; exit 1' 1 2 15
# determine if ngram-count can generate compressed files
test_in=$countdir/testin
test_out=$countdir/testout.gz
echo x > $test_in
ngram-count -text $test_in -write $test_out
if gzip -l $test_out >/dev/null 2>&1; then
gz=.gz
else
gz=
fi
rm $test_in $test_out
> $statsfile
#
# format filelist into one batch per line, preceded by line number
#
${GAWK-gawk} -v batchsize=$batchsize \
'BEGIN {
batchno = 1;
}
/^#/ || NF == 0 {
next;
}
{
files = files " " $0;
numfiles += 1;
if (numfiles >= batchsize) {
print batchno, files;
files = "";
numfiles = 0;
batchno += 1;
}
}
END {
if (numfiles > 0) {
print batchno, files;
}
}' $filelist | \
while read fileno datafiles; do
newfile=$countdir/$what-$fileno.ngrams$gz
	# feed $datafiles to the filter via xargs to avoid command-line length limits
cat <<EOF >&2
counting in $newfile sources $datafiles
EOF
echo $datafiles | \
xargs $filter | \
ngram-count -text - \
-tag $newfile \
-sort \
-write-order 0 \
-write $newfile \
$options \
2>> $statsfile
done

View File

@@ -0,0 +1,276 @@
#!/bin/sh
#
# make-big-lm --
# Create a large ngram language model
#
# This script automates various techniques for building large ngram models.
# It is useful for building LMs that would exceed available real memory
# if built in one pass by ngram-count.
# The techniques employed are
# - Assume counts are already produced
# (typically using make-batch-counts/merge-batch-counts)
# - Compute Good Turing discounts without loading all counts
# into memory.
#	- ngram-count loads only those counts exceeding cutoff values.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-big-lm,v 1.25 2015-05-27 08:10:52 stolcke Exp $
#
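# Example invocation (hypothetical paths), after make-batch-counts and
# merge-batch-counts have produced a single merged count file:
#	make-big-lm -name big4 -read counts/merged.ngrams.gz -order 4 \
#		-kndiscount -unk -lm big4.lm.gz
# Options not recognized here (such as -unk and -lm) are passed through
# to ngram-count.
#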
name=biglm
order=3
gt1min=1
gt2min=1
gt3min=2
gt4min=2
gt5min=2
gt6min=2
gt7min=2
gt8min=2
gt9min=2
gt1max=7
gt2max=7
gt3max=7
gt4max=7
gt5max=7
gt6max=7
gt7max=7
gt8max=7
gt9max=7
kndiscount1=0
kndiscount2=0
kndiscount3=0
kndiscount4=0
kndiscount5=0
kndiscount6=0
kndiscount7=0
kndiscount8=0
kndiscount9=0
ukndiscount1=0
ukndiscount2=0
ukndiscount3=0
ukndiscount4=0
ukndiscount5=0
ukndiscount6=0
ukndiscount7=0
ukndiscount8=0
ukndiscount9=0
using_kn=
max_per_file=10000000
ngram_filter=cat
subset_filter=cat
counts=
test_data=
trust_totals=0
metatag=__meta__ # lowercase so it works with ngram-count -tolower
# avoid locale problems with gawk script computing discounting parameters
LC_NUMERIC=C; export LC_NUMERIC
while [ $# -gt 0 ]; do
case "$1" in
-name) name=$2; shift ;;
-order) order=$2 ; shift ;;
-gt1min) gt1min=$2; options="$options $1 $2" ; shift ;;
-gt2min) gt2min=$2; options="$options $1 $2" ; shift ;;
-gt3min) gt3min=$2; options="$options $1 $2" ; shift ;;
-gt4min) gt4min=$2; options="$options $1 $2" ; shift ;;
-gt5min) gt5min=$2; options="$options $1 $2" ; shift ;;
-gt6min) gt6min=$2; options="$options $1 $2" ; shift ;;
-gt7min) gt7min=$2; options="$options $1 $2" ; shift ;;
-gt8min) gt8min=$2; options="$options $1 $2" ; shift ;;
-gt9min) gt9min=$2; options="$options $1 $2" ; shift ;;
-gt1max) gt1max=$2; using_gt=1; shift ;;
-gt2max) gt2max=$2; using_gt=1; shift ;;
-gt3max) gt3max=$2; using_gt=1; shift ;;
-gt4max) gt4max=$2; using_gt=1; shift ;;
-gt5max) gt5max=$2; using_gt=1; shift ;;
-gt6max) gt6max=$2; using_gt=1; shift ;;
-gt7max) gt7max=$2; using_gt=1; shift ;;
-gt8max) gt8max=$2; using_gt=1; shift ;;
-gt9max) gt9max=$2; using_gt=1; shift ;;
	-kndiscount1)	kndiscount1=1; using_kn=1 ;;
	-kndiscount2)	kndiscount2=1; using_kn=1 ;;
	-kndiscount3)	kndiscount3=1; using_kn=1 ;;
	-kndiscount4)	kndiscount4=1; using_kn=1 ;;
	-kndiscount5)	kndiscount5=1; using_kn=1 ;;
	-kndiscount6)	kndiscount6=1; using_kn=1 ;;
	-kndiscount7)	kndiscount7=1; using_kn=1 ;;
	-kndiscount8)	kndiscount8=1; using_kn=1 ;;
	-kndiscount9)	kndiscount9=1; using_kn=1 ;;
-kndiscount) kndiscount1=1; kndiscount2=1; kndiscount3=1;
kndiscount4=1; kndiscount5=1; kndiscount6=1;
kndiscount7=1; kndiscount8=1; kndiscount9=1;
using_kn=1 ;;
	-ukndiscount1)	kndiscount1=1; ukndiscount1=1; using_kn=1 ;;
	-ukndiscount2)	kndiscount2=1; ukndiscount2=1; using_kn=1 ;;
	-ukndiscount3)	kndiscount3=1; ukndiscount3=1; using_kn=1 ;;
	-ukndiscount4)	kndiscount4=1; ukndiscount4=1; using_kn=1 ;;
	-ukndiscount5)	kndiscount5=1; ukndiscount5=1; using_kn=1 ;;
	-ukndiscount6)	kndiscount6=1; ukndiscount6=1; using_kn=1 ;;
	-ukndiscount7)	kndiscount7=1; ukndiscount7=1; using_kn=1 ;;
	-ukndiscount8)	kndiscount8=1; ukndiscount8=1; using_kn=1 ;;
	-ukndiscount9)	kndiscount9=1; ukndiscount9=1; using_kn=1 ;;
-ukndiscount) kndiscount1=1; kndiscount2=1; kndiscount3=1;
kndiscount4=1; kndiscount5=1; kndiscount6=1;
kndiscount7=1; kndiscount8=1; kndiscount9=1;
ukndiscount1=1; ukndiscount2=1; ukndiscount3=1;
ukndiscount4=1; ukndiscount5=1; ukndiscount6=1;
ukndiscount7=1; ukndiscount8=1; ukndiscount9=1;
using_kn=1 ;;
-wbdiscount) using_wb=1 ;;
-wbdiscount*|-cdiscount*|-ndiscount*|-addsmooth*)
echo "$0: must use one of GT, KN, UKN, or WB discounting for all orders" >&2
exit 2 ;;
-read) if [ "$2" = "" -o "$2" = - -o "$2" = "/dev/stdin" ]; then
echo "$0: cannot read from stdin" >&2
exit 2
fi
counts="$counts $2" ; shift ;;
-trust-totals) trust_totals=1 ;;
-max-per-file) max_per_file=$2 ; shift ;;
-ngram-filter) ngram_filter="$2" ; shift ;;
-text) test_data="$2"; shift ;;
*) options="$options $1" ;;
esac
shift
done
if [ -z "$counts" ]; then
echo "No counts specified" >&2
echo "usage: $0 -read COUNTS [-name PATH] [-text TESTSET] [-ngram-filter FILTER] [-max-per-file N] [ngram-count-options ...]" >&2
exit 2
fi
if [ -n "$using_gt" -a -n "$using_kn" -o \
-n "$using_gt" -a -n "$using_wb" -o \
-n "$using_kn" -a -n "$using_wb" ]
then
echo "$0: cannot mix GT, KN, and WB discounting" >&2
exit 2
fi
if [ $trust_totals -eq 0 ]; then
options="$options -meta-tag $metatag"
else
if [ "$using_kn" ]; then
echo "$0: -trust-totals incompatible with KN discounting; ignoring it" >&2
options="$options -meta-tag $metatag"
else
options="$options -trust-totals"
fi
fi
set -e
#
# if KN smoothing is used, compute the modified lower-order counts
#
if [ "$using_kn" ]; then
kncounts=$name.kncounts.gz
if [ -f $kncounts ]; then
echo "using existing $kncounts" >&2
elif [ $order -eq 1 ]; then
# create a dummy empty file
gzip -f < /dev/null > $kncounts
else
mkdir -p $name.kndir
gzip -dcf $counts | \
eval "$ngram_filter" | \
(set -x; make-kn-counts \
no_max_order=1 max_per_file=$max_per_file \
order=$order \
kndiscount1=$kndiscount1 kndiscount2=$kndiscount2 \
kndiscount3=$kndiscount3 kndiscount4=$kndiscount4 \
kndiscount5=$kndiscount5 kndiscount6=$kndiscount6 \
kndiscount7=$kndiscount7 kndiscount8=$kndiscount8 \
kndiscount9=$kndiscount9 \
output=$name.kndir/kncounts)
(set -x; merge-batch-counts $name.kndir)
# this will fail if more than one count file is left in kndir,
# i.e., if merging didn't finish successfully
mv `find $name.kndir -name \*.ngrams.gz -print ` $kncounts
fi
options="$options -kn-counts-modified"
fi
#
# compute counts-of-counts
#
if [ "$using_wb" ]; then
:
elif [ -f $name.gt${order}counts ]; then
echo "using existing gtcounts" >&2
else
if [ "$using_kn" ]; then
# concatenate KN modified counts with highest-order original counts
# Note: even though $kncounts ends in .gz it might be a plain file
	# if platform doesn't support gzip pipes, so use gzip -dcf .
gzip -dcf $kncounts | ${GAWK-gawk} 'NF < 1+'$order
gzip -dcf $counts | eval "$ngram_filter" | ${GAWK-gawk} 'NF == 1+'$order
else
gzip -dcf $counts | eval "$ngram_filter"
fi | (set -x; get-gt-counts out=$name max=20 maxorder=$order)
fi
#
# compute discount factors
#
if [ "$using_wb" ]; then
# apply WB discount to all ngram orders
gtflags=-wbdiscount
else
gtflags=
fi
for n in 1 2 3 4 5 6 7 8 9
do
if [ $n -le $order -a -f $name.gt${n}counts ]; then
if (set +e; eval [ \"\$ukndiscount${n}\" -eq 1 ]); then
gtflags="$gtflags -kn${n} $name.kn${n}"
eval make-kn-discounts modified=0 \
min=\$gt${n}min $name.gt${n}counts > $name.kn${n}
elif (set +e; eval [ \"\$kndiscount${n}\" -eq 1 ]); then
gtflags="$gtflags -kn${n} $name.kn${n}"
eval make-kn-discounts \
min=\$gt${n}min $name.gt${n}counts > $name.kn${n}
else
gtflags="$gtflags -gt${n} $name.gt${n}"
eval make-gt-discounts \
min=\$gt${n}min max=\$gt${n}max \
$name.gt${n}counts > $name.gt${n}
fi
fi
done
# if test data is specified compute context ngrams
if [ -n "$test_data" -a $order -gt 1 ]; then
order1=`expr $order - 1`
(set -x; \
ngram-count -order $order1 -text "$test_data" -sort -write $name.contexts)
# ... and filter the ngrams to contain only the required contexts
subset_filter="subset-context-ngrams contexts=$name.contexts"
fi
#
# filter counts and build lm
#
if [ "$using_kn" ]; then
# concatenate KN modified counts with highest-order original counts
# Note: even though $kncounts ends in .gz it might be a plain file
	# if platform doesn't support gzip pipes, so use gzip -dcf .
gzip -dcf $kncounts | ${GAWK-gawk} 'NF < 1+'$order
gzip -dcf $counts | eval "$ngram_filter" | ${GAWK-gawk} 'NF == 1+'$order
else
gzip -dcf $counts | eval "$ngram_filter"
fi | \
eval "$subset_filter" | \
(set -x; \
ngram-count -read - -read-with-mincounts -order $order \
$gtflags \
$options)
rm -f $name.contexts

View File

@@ -0,0 +1,89 @@
#!/usr/local/bin/gawk -f
#
# make-diacritic-map --
# Generate a map from ascii to accented word forms
# for use with disambig(1)
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-diacritic-map.gawk,v 1.3 1998/02/04 20:28:02 stolcke Exp $
#
/^#/ {
next;
}
function asciify(word) {
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "AE", word);
gsub("<22>", "C", word);
gsub("<22>", "E", word);
gsub("<22>", "E", word);
gsub("<22>", "E", word);
gsub("<22>", "E", word);
gsub("<22>", "I", word);
gsub("<22>", "I", word);
gsub("<22>", "I", word);
gsub("<22>", "I", word);
gsub("<22>", "N", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "U", word);
gsub("<22>", "U", word);
gsub("<22>", "U", word);
gsub("<22>", "U", word);
gsub("<22>", "Y", word);
gsub("<22>", "ss", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "c", word);
gsub("<22>", "e", word);
gsub("<22>", "e", word);
gsub("<22>", "e", word);
gsub("<22>", "e", word);
gsub("<22>", "i", word);
gsub("<22>", "i", word);
gsub("<22>", "i", word);
gsub("<22>", "i", word);
gsub("<22>", "n", word);
gsub("<22>", "o", word);
gsub("<22>", "o", word);
gsub("<22>", "o", word);
gsub("<22>", "o", word);
gsub("<22>", "o", word);
gsub("<22>", "u", word);
gsub("<22>", "u", word);
gsub("<22>", "u", word);
gsub("<22>", "u", word);
gsub("<22>", "y", word);
return word;
}
{
word = $1;
asciiword = asciify(word);
if (asciiword in map) {
map[asciiword] = map[asciiword] " " word;
} else {
map[asciiword] = word;
}
}
END {
print "<s>\t<s>"
print "</s>\t</s>"
fflush()
for (w in map) {
print w "\t" map[w] | "sort";
}
}

View File

@@ -0,0 +1,124 @@
#!/usr/local/bin/gawk -f
#
# make-google-ngrams --
# split ngram count file into an indexed directory structure
# compatible with the Google ngrams distributed by LDC
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-google-ngrams.gawk,v 1.6 2010/08/20 00:17:18 stolcke Exp $
#
# usage: zcat counts.gz | make-google-ngrams [dir=DIR] [per_file=N] [gzip=0] [yahoo=1]
#
# INPUT DATA is assumed to be a sorted ngram count file
#
#
# OUTPUT DATA FORMAT
#
# a) top-level directory
# doc: documentation
# data: data
# (the top-level structure is required by LDC)
# b) data directory
# one sub-directory per n-gram order: 1gms, 2gms, 3gms, 4gms, 5gms
# (separating the orders makes it easier for people to use smaller orders)
# c) contents of sub-directory 1gms
# - file 'vocab.gz' contains the vocabulary sorted by word in unix
# sort-order. Each word is on its own line:
# WORD <tab> COUNT
# - file 'vocab_cs.gz' contains the same data as 'vocab.gz' but
# sorted by count.
# (need to be 8+3 file names)
# d) contents of sub-directories 2gms, 3gms, 4gms, 5gms:
# - files 'Ngm-KKKK.gz' where N is the order of the n-grams
# and KKKK is the zero-padded number of the file. Each file contains
# 10 million n-gram entries. N-grams are unix-sorted. Each
# n-gram occupies one line:
# WORD1 <space> WORD2 <space> ... WORDN <tab> COUNT
# - file 'Ngm.idx' where N is the order of the n-grams, with one line for
# each n-gram file:
# FILENAME <tab> FIRST_NGRAM_IN_FILE
BEGIN {
dir = "data";
per_file = 10000000;
gzip = 1;
}
NR == 1 {
if (gzip) {
gzip_cmd = "gzip";
gzip_suff = ".gz";
} else {
gzip_cmd = "cat";
gzip_suff = "";
}
}
# determine ngram length
{
if (yahoo) {
order = NF - 5;
if (order > 0) {
$NF = $(NF-1) = $(NF-2) = $(NF-3) = "";
}
} else {
order = NF - 1;
}
}
#
# unigrams
#
order == 1 {
if (!have_dir[1]) {
system("mkdir -p " dir "/1gms");
have_dir[1] = 1;
output_file[1] = gzip_cmd " > " dir "/1gms/vocab" gzip_suff;
}
print | output_file[1];
next;
}
order > 1 {
if (output_ngram_count[order] == 0) {
output_ngram_count[order] = 1;
system("mkdir -p " dir "/" order "gms");
if (output_file[order]) close(output_file[order]);
output_name = sprintf("%dgm-%04d%s", order, output_file_count[order] ++, gzip_suff);
output_file[order] = gzip_cmd " > " dir "/" order "gms/" output_name;
ngram = $1;
for (i = 2; i <= order; i ++) {
ngram = ngram " " $i;
}
print output_name "\t" ngram > (dir "/" order "gms/" order "gm.idx");
}
print | output_file[order];
output_ngram_count[order] += 1;
output_ngram_count[order] %= (per_file + 1);
next;
}
order < 1 {
print FILENAME ": " FNR ": insufficient number of fields" > "/dev/stderr";
print $0 > "/dev/stderr";
exit(1);
}
#
# sort unigrams by count
#
END {
close(output_file[1]);
if (have_dir[1]) {
system("gzip -dcf " dir "/1gms/vocab" gzip_suff " | sort -k 2,2rn | " gzip_cmd " > " dir "/1gms/vocab_cs" gzip_suff);
}
}

View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/gawk -f
#
# make-gt-discounts --
# generate Good-Turing discounting parameters from a count-of-count
# file
#
# The purpose of this script is to do the GT computation off-line,
# without ngram-count having to read all counts into memory.
# The output is compatible with the ngram-count -gt<n> options.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-gt-discounts.gawk,v 1.3 2004/11/02 02:00:35 stolcke Exp $
#
# usage: make-gt-discounts min=<mincount> max=<maxcount> countfile
#
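# The discount coefficients written out follow the Good-Turing formula
# used below: with n_i the count-of-counts and K = maxcount,
#	d_i = ( (i+1)*n_{i+1} / (i*n_i) - A ) / ( 1 - A ),
#	A   = (K+1)*n_{K+1} / n_1
# for 1 <= i <= K.
#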
BEGIN {
min=1;
max=7;
}
/^#/ {
# skip comments
next;
}
{
countOfCounts[$1] = $2;
}
END {
# Code below is essentially identical to GoodTuring::estimate()
# (Discount.cc).
minCount = min;
maxCount = max;
if (!countOfCounts[1]) {
printf "warning: no singleton counts\n" >> "/dev/stderr";
maxCount = 0;
}
while (maxCount > 0 && countOfCounts[maxCount + 1] == 0) {
printf "warning: count of count %d is zero -- lowering maxcount\n", \
maxCount + 1 >> "/dev/stderr";
maxCount --;
}
if (maxCount <= 0) {
printf "GT discounting disabled\n" >> "/dev/stderr";
} else {
commonTerm = (maxCount + 1) * \
countOfCounts[maxCount + 1] / \
countOfCounts[1];
for (i = 1; i <= maxCount; i++) {
if (countOfCounts[i] == 0) {
printf "warning: count of count %d is zero\n", \
i >> "/dev/stderr";
coeff = 1.0;
} else {
coeff0 = (i + 1) * countOfCounts[i+1] / \
(i * countOfCounts[i]);
coeff = (coeff0 - commonTerm) / (1.0 - commonTerm);
if (coeff <= 0 || coeff0 > 1.0) {
printf "warning: discount coeff %d is out of range: %g\n", \
i, coeff >> "/dev/stderr";
coeff = 1.0;
}
}
discountCoeffs[i] = coeff;
}
}
printf "mincount %d\n", minCount;
printf "maxcount %d\n", maxCount;
for (i = 1; i <= maxCount; i++) {
printf "discount %d %g\n", i, discountCoeffs[i];
}
}

View File

@@ -0,0 +1,100 @@
#!/usr/local/bin/gawk -f
#
# make-hiddens-lm --
# Create a hidden-sentence-boundary ngram LM from a standard one
#
# This script edits an ARPA backoff model file as follows:
#
# 1 - ngrams involving <s> and </s> are duplicated using the
# hidden segment boundary token <#s>.
# 2 - ngrams starting with <s> are eliminated.
# 3 - the backoff weight of <s> is set to 1.
# this together with the previous change sets all probabilities conditioned
#	on <s> to the respective marginal probabilities without <s>.
# 4 - ngrams ending in </s> get probability 1.
# this avoids an end-of-sentence penalty in rescoring.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-hiddens-lm.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
sent_start = "<s>";
sent_end = "</s>";
hiddens = "<#s>";
remove_old_ngrams = 0;
}
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
print;
next;
}
/^.[0-9]-grams:/ {
currorder=substr($0,2,1);
}
/^\\/ {
print; next;
}
#
currorder && currorder < highorder {
if (NF < currorder + 2) {
print $0 "\t0";
} else {
print;
}
next;
}
$0 ~ sent_start || $0 ~ sent_end {
oldline = $0;
# modify sentence initial/final ngrams
if ($2 == sent_end && currorder == 1) {
sos_uniprob = $1;
if (no_s_end) {
# set </s> prob to 1
$1 = 0;
}
if (!remove_old_ngrams) {
print;
}
next;
} else if ($2 == sent_start && currorder == 1) {
if (no_s_start) {
# set <s> backoff weight to 1
$3 = 0;
}
if (!remove_old_ngrams) {
print;
}
# use unigram prob from </s>
if (sos_uniprob == "") {
print "warning: could not find " sent_end " unigram" \
>> "/dev/stderr";
} else {
oldline = sos_uniprob "\t" $2 "\t" $3;
}
} else if ($2 == sent_start) {
# suppress other ngrams starting with <s>
if (!no_s_start && !remove_old_ngrams) {
print;
}
} else if ($(currorder + 1) == sent_end) {
if (no_s_end) {
# set </s> prob to 1
$1 = 0;
}
if (!remove_old_ngrams) {
print;
}
}
# replace <s> and </s> with <#s> and output result
gsub(sent_start, hiddens, oldline);
gsub(sent_end, hiddens, oldline);
print oldline;
next;
}
{ print }

View File

@@ -0,0 +1,82 @@
#!/usr/local/bin/gawk -f
#
# make-kn-counts --
# Modify N-gram counts for KN smoothing
#
# This duplicates the action of ModKneserNey::prepareCounts().
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-kn-counts.gawk,v 1.5 2007/06/16 04:51:18 stolcke Exp $
#
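# In effect, every lower-order count subject to KN discounting is replaced
# by a Kneser-Ney "type" count, i.e. the number of distinct words preceding
# the ngram: for each higher-order line "w1 w2 ... wn c" the script emits
# "w2 ... wn 1" and lets ngram-count sum these 1s into the modified count.
#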
BEGIN {
order = 3;
no_max_order = 0;
sent_start = "<s>";
ngram_count = "ngram-count";
output = "-";
max_per_file = 0;
file_no = 0;
ngram_no = 0;
}
function set_output () {
close(output_cmd);
ngram_cmd = ngram_count " -order " order " -read - -sort -write ";
if (max_per_file > 0) {
output_cmd = ngram_cmd output "-" ++file_no ".ngrams.gz";
} else {
output_cmd = ngram_cmd output;
}
}
NR == 1 {
kndiscount[1] = kndiscount1;
kndiscount[2] = kndiscount2;
kndiscount[3] = kndiscount3;
kndiscount[4] = kndiscount4;
kndiscount[5] = kndiscount5;
kndiscount[6] = kndiscount6;
kndiscount[7] = kndiscount7;
kndiscount[8] = kndiscount8;
kndiscount[9] = kndiscount9;
if (output == "-") {
max_per_file = 0;
}
set_output();
}
# discard ngrams not used in LM building
NF - 1 > order {
next;
}
# keep ngrams not subject to KN discounting, or those starting with <s>
# if desired, highest-order ngrams are discarded to save space
NF - 1 == order || !kndiscount[NF - 1] || $1 == sent_start {
if (!no_max_order || NF - 1 < order) {
if (max_per_file > 0 && ++ngram_no % max_per_file == 0) {
ngram_no = 0;
set_output();
}
print | output_cmd;
}
}
# modify lower-order ngrams subject to KN discounting
NF - 2 < order && kndiscount[NF - 2] && $2 != sent_start {
$1 = $NF = "";
if (max_per_file > 0 && ++ngram_no % max_per_file == 0) {
ngram_no = 0;
set_output();
}
# we let ngram-count add up the new counts for us
print $0, 1 | output_cmd;
}

View File

@@ -0,0 +1,119 @@
#!/usr/local/bin/gawk -f
#
# make-kn-discounts --
# generate modified Kneser-Ney discounting parameters from a
# count-of-count file
#
# The purpose of this script is to do the KN computation off-line,
# without ngram-count having to read all counts into memory.
# The output is compatible with the ngram-count -kn<n> options.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-kn-discounts.gawk,v 1.7 2015-05-27 08:10:52 stolcke Exp $
#
# usage: make-kn-discounts modified=<0|1> min=<mincount> countfile
#
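# The discounts written out follow the (modified) Kneser-Ney estimates
# computed below from the counts-of-counts n1..n4:
#	Y   = n1 / (n1 + 2*n2)
#	D1  = 1 - 2*Y*n2/n1
#	D2  = 2 - 3*Y*n3/n2
#	D3+ = 3 - 4*Y*n4/n3
# With modified=0 all three discounts are simply Y (original KN).
#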
BEGIN {
min = 1;
modified = 1;
}
/^#/ {
# skip comments
next;
}
{
countOfCounts[$1] = $2;
if ($1 != "total" && $1 > maxCount && $2 > 0) {
maxCount = $1;
}
}
#
# Estimate missing counts-of-counts f(k) based on the empirical law
#
# log f(k) - log f(k+1) = a / k
#
# for some constant a dependent on the distribution.
#
function handle_missing_counts() {
#
# compute average a value based on well-defined counts-of-counts
#
a_sum = 0;
for (k = maxCount - 1; k > 0; k --) {
if (countOfCounts[k] == 0) break;
a = k * (log(countOfCounts[k]) - log(countOfCounts[k + 1]));
if (debug) {
print "k = " k ", a = " a > "/dev/stderr";
}
a_sum += a;
}
if (maxCount - 1 == k) {
# no data to estimate a, give up
return;
}
avg_a = a_sum / (maxCount - k - 1);
if (debug) {
print "average a = " avg_a > "/dev/stderr";
}
## print "avg_a", avg_a > "/dev/stderr";
for ( ; k > 0; k --) {
if (countOfCounts[k] == 0) {
countOfCounts[k] = exp(log(countOfCounts[k + 1]) + avg_a / k);
print "estimating missing count-of-count " k \
" = " countOfCounts[k] > "/dev/stderr";
}
}
}
END {
# Code below is essentially identical to ModKneserNey::estimate()
# (Discount.cc).
handle_missing_counts();
if (countOfCounts[1] == 0 || \
countOfCounts[2] == 0 || \
modified && countOfCounts[3] == 0 || \
modified && countOfCounts[4] == 0) \
{
printf "error: one of required counts of counts is zero\n" \
>> "/dev/stderr";
exit(2);
}
Y = countOfCounts[1]/(countOfCounts[1] + 2 * countOfCounts[2]);
if (modified) {
discount1 = 1 - 2 * Y * countOfCounts[2] / countOfCounts[1];
discount2 = 2 - 3 * Y * countOfCounts[3] / countOfCounts[2];
discount3plus = 3 - 4 * Y * countOfCounts[4] / countOfCounts[3];
} else {
# original KN discounting
discount1 = discount2 = discount3plus = Y;
}
print "mincount", min;
print "discount1", discount1;
print "discount2", discount2;
print "discount3+", discount3plus;
# check for invalid values after output, so we see where the problem is
if (discount1 < 0 || discount2 < 0 || discount3plus < 0) {
printf "error: one of modified KneserNey discounts is negative\n" \
>> "/dev/stderr";
exit(2);
}
}

View File

@@ -0,0 +1,32 @@
#!/usr/local/bin/gawk -f
#
# filter a backoff model with a count file, so that only ngrams
# in the countfile are represented in the output
#
# usage: make-lm-subset count-file bo-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-lm-subset.gawk,v 1.3 1999/10/17 06:10:10 stolcke Exp $
#
ARGIND==1 {
ngram = $0;
sub("[ ]*[0-9]*$", "", ngram);
count[ngram] = 1;
next;
}
ARGIND==2 && /^$/ {
print; next;
}
ARGIND==2 && /^\\/ {
print; next;
}
ARGIND==2 && /^ngram / {
print; next;
}
ARGIND==2 {
ngram = $0;
# strip numeric stuff
sub("^[-.e0-9]*[ ]*", "", ngram);
sub("[ ]*[-.e0-9]*$", "", ngram);
if (count[ngram]) print;
next;
}

View File

@@ -0,0 +1,73 @@
#!/usr/local/bin/gawk -f
#
# make-meta-counts --
# Apply N-gram count cut-offs and insert meta-counts (counts-of-counts)
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-meta-counts.gawk,v 1.2 2002/07/22 21:24:45 stolcke Exp $
#
BEGIN {
order = 3;
# trust_total=1 means we don't have to generate meta-counts, just
# apply the cut-offs (in combination with ngram-count -trust-totals)
trust_totals = 0;
metatag = "__META__";
}
NR == 1 {
mincount[1] = mincount1 + 0;
mincount[2] = mincount2 + 0;
mincount[3] = mincount3 + 0;
mincount[4] = mincount4 + 0;
mincount[5] = mincount5 + 0;
mincount[6] = mincount6 + 0;
mincount[7] = mincount7 + 0;
mincount[8] = mincount8 + 0;
mincount[9] = mincount9 + 0;
}
NF > order + 1 {
next;
}
NF > 1 {
this_order = NF - 1;
if (!trust_totals) {
# output buffered ngrams of higher order IF there was at least
# one non-meta count of the respective order
for (i = order; i > this_order; i --) {
if (have_counts[i]) {
printf "%s", buffer[i];
have_counts[i] = 0;
}
delete buffer[i];
}
}
if ($NF < mincount[this_order]) {
if (trust_totals) {
next;
} else {
# convert below-cutoff ngram to meta-ngram
$this_order = metatag int($NF);
$NF = 1;
# add it to buffer
buffer[this_order] = buffer[this_order] $0 "\n";
}
} else {
have_counts[this_order] = 1;
print;
}
}
END {
# output any remaining buffered ngrams
for (i = order; i >= 1; i --) {
if (have_counts[i]) {
printf "%s", buffer[i];
}
}
}

View File

@@ -0,0 +1,70 @@
#!/bin/sh
#
# make-multiword-pfsg --
# rewrite a PFSG in terms of multiwords
#
# usage: make-multiword-pfsg multiword-defs [pfsg] > new-pfsg
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-multiword-pfsg,v 1.5 2015-07-03 03:45:39 stolcke Exp $
#
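# multiword-defs is expected in class-expansion format, one multiword per
# line followed by its component words, e.g. (hypothetical entries):
#	going_to	going to
#	i_mean		i mean
#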
multiword_defs=${1}
shift
tmpdir=${TMPDIR-/tmp}
name="$tmpdir/name.$$"
vocab="$tmpdir/vocab.$$"
old_fsm="$tmpdir/infsm.$$.gz"
class_fsm="$tmpdir/classfsm.$$"
class_fsmc="$tmpdir/classfsmc.$$"
mw_symbols="$tmpdir/mw_symbols.$$"
word_symbols="$tmpdir/word_symbols.$$"
trap "rm -f $name $vocab $old_fsm $class_fsm $class_fsmc $mw_symbols $word_symbols; exit" 0 1 2 15
#
# extract vocab and convert PFSG to FSM
#
${GAWK-gawk} -v name=$name -v vocab=$vocab '$1 == "name" && !have_name {
have_name = 1;
print $2 > name;
}
$1 == "nodes" {
# collect vocabulary
for (i = 3; i <= NF; i ++) {
if ($i != "NULL") is_word[$i] = 1;
}
}
{ print;
}
END {
for (word in is_word) {
print word > vocab
}
}' "$@" | \
pfsg-to-fsm symbolic=1 | \
gzip > $old_fsm
new_name=`cat $name`_multiwords
#
# create multiword transducer
# Note: this is the same as reversed class-transducer
#
classes-to-fsm vocab=$vocab symbolic=1 \
isymbolfile=$mw_symbols \
osymbolfile=$word_symbols \
$multiword_defs > $class_fsm
fsmcompile -t -i $mw_symbols -o $word_symbols $class_fsm | \
fsminvert > $class_fsmc
#
# compose original FSM with multiword transducer;
# then convert back to PFSG
#
{ gzip -dcf $old_fsm; rm -f $old_fsm; } | fsmcompile -i $word_symbols | \
fsmcompose - $class_fsmc | fsmproject -o | \
fsmprint -i $mw_symbols | fsm-to-pfsg pfsg_name=$new_name

View File

@@ -0,0 +1,135 @@
#!/usr/local/bin/gawk -f
#
# nbest2pfsg --
# convert Decipher N-best list to PFSG lattice
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-nbest-pfsg.gawk,v 1.5 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
initial = 0;
final = 1;
nodecount = 2;
transcount = 0;
null = "NULL";
outputs[initial] = outputs[final] = null;
format = 0;
name = "";
	notree = 0;	# 1 = don't merge hyps into a prefix tree
scale = 0; # scaling factor for log posteriors
amw = 1; # acoustic model weight
lmw = 8; # language model weight
wtw = 0; # word transition weight
}
function start_hyp() {
lastnode = initial;
}
function add_word(word, weight) {
nextnode = tree[lastnode " " word];
if (nextnode && !notree) {
if (weights[lastnode " " nextnode] != weight) {
printf "inconsistent weight for transition %s -> %s\n",\
lastnode, nextnode >> "/dev/stderr";
exit 1;
}
lastnode = nextnode;
} else {
newnode = nodecount ++;
outputs[newnode] = word;
tree[lastnode " " word] = newnode;
weights[lastnode " " newnode] = weight;
transcount ++;
lastnode = newnode;
}
}
function end_hyp(weight) {
nextnode = tree[lastnode " " null];
if (nextnode && !notree) {
if (weights[lastnode " " nextnode] != weight) {
printf "inconsistent final weight for %s\n",\
lastnode >> "/dev/stderr";
exit 1;
}
} else {
tree[lastnode " " null] = final;
weights[lastnode " " final] = weight;
transcount ++;
}
}
function print_pfsg(name) {
printf "name %s\n", name;
printf "nodes %d", nodecount;
for (node = 0; node < nodecount; node ++) {
printf " %s", outputs[node];
}
printf "\n";
printf "initial %d\n", initial;
printf "final %d\n", final;
printf "transitions %d\n", transcount;
for (trans in weights) {
split(trans, a);
fromnode = a[1];
tonode = a[2];
printf "%d %d %g\n", fromnode, tonode, \
weights[fromnode " " tonode];
}
printf "\n";
}
/^NBestList1\.0/ {
format = 1;
next;
}
/^NBestList2\.0/ {
format = 2;
next;
}
format == 0 {
totalscore = scale * (amw * $1 + lmw * $2 + wtw * $3);
start_hyp();
for (i = 4; i <= NF; i ++) {
add_word($i, 0);
}
end_hyp(totalscore);
next;
}
format == 1 {
totalscore = scale * substr($1, 2, length($1)-2);
start_hyp();
for (i = 2; i <= NF; i ++) {
add_word($i, 0);
}
end_hyp(totalscore);
next;
}
format == 2 {
start_hyp();
for (i = 2; i <= NF; i += 11) {
add_word($i, scale * ($(i + 7) + $(i + 9)));
}
end_hyp(0);
next;
}
END {
if (!name) {
name = FILENAME;
}
print_pfsg(name);
}

View File

@@ -0,0 +1,351 @@
#!/usr/local/bin/gawk -f
#
# make-ngram-pfsg --
# Create a Decipher PFSG from an N-gram language model
#
# usage: make-ngram-pfsg [debug=1] [check_bows=1] [maxorder=N] [no_empty_bo=1] backoff-lm > pfsg
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-ngram-pfsg.gawk,v 1.32 2015-07-03 03:45:38 stolcke Exp $
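# Sketch of the construction (as implemented below): every N-gram context
# listed in the LM becomes a PFSG node emitting its last word; explicit
# N-gram probabilities become transitions between contexts, and backoff
# weights become transitions into NULL-output "__BACKOFF__ <context>" nodes
# from which the lower-order estimates can be reached.
#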
#
#########################################
#
# Output format specific code
#
BEGIN {
logscale = 2.30258509299404568402 * 10000.5;
round = 0.5;
start_tag = "<s>";
end_tag = "</s>";
null = "NULL";
version = 0;
top_level_name = "";
no_empty_bo = 0;
if ("TMPDIR" in ENVIRON) {
tmpdir = ENVIRON["TMPDIR"];
} else {
tmpdir = "/tmp"
}
if ("pid" in PROCINFO) {
pid = PROCINFO["pid"];
} else {
getline pid < "/dev/pid";
}
tmpfile = tmpdir "/pfsg." pid;
# hack to remove tmpfile when killed
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
print "" | trap_cmd;
debug = 0;
write_contexts = "";
read_contexts = "";
}
function rint(x) {
if (x < 0) {
return int(x - round);
} else {
return int(x + round);
}
}
function scale_log(x) {
return rint(x * logscale);
}
function output_for_node(name) {
num_words = split(name, words);
if (num_words == 0) {
print "output_for_node: got empty name" >> "/dev/stderr";
exit(1);
} else if (words[1] == bo_name) {
return null;
} else if (words[num_words] == end_tag || \
words[num_words] == start_tag)
{
return null;
} else {
return words[num_words];
}
}
function node_exists(name) {
return (name in node_num);
}
function node_index(name) {
i = node_num[name];
if (i == "") {
i = num_nodes ++;
node_num[name] = i;
node_string[i] = output_for_node(name);
if (debug) {
print "node " i " = " name ", output = " node_string[i] \
>> "/dev/stderr";
}
}
return i;
}
function start_grammar(name) {
num_trans = 0;
num_nodes = 0;
return;
}
function end_grammar(name) {
if (!node_exists(start_tag)) {
print start_tag " tag undefined in LM" >> "/dev/stderr";
exit(1);
} else if (!node_exists(end_tag)) {
print end_tag " tag undefined in LM" >> "/dev/stderr";
exit(1);
}
printf "%d pfsg nodes\n", num_nodes >> "/dev/stderr";
printf "%d pfsg transitions\n", num_trans >> "/dev/stderr";
# output version id if supplied
if (version) {
print "version " version "\n";
}
# use optional top-level grammar name if given
print "name " (top_level_name ? top_level_name : name);
printf "nodes %s", num_nodes;
for (i = 0; i < num_nodes; i ++) {
printf " %s", node_string[i];
}
printf "\n";
print "initial " node_index(start_tag);
print "final " node_index(end_tag);
print "transitions " num_trans;
fflush();
if (close(tmpfile) < 0) {
print "error closing tmp file" >> "/dev/stderr";
exit(1);
}
system("/bin/cat " tmpfile);
}
function add_trans(from, to, prob) {
if (debug) {
print "add_trans " from " -> " to " " prob >> "/dev/stderr";
}
num_trans ++;
print node_index(from), node_index(to), scale_log(prob) > tmpfile;
}
#########################################
#
# Generic code for parsing backoff file
#
BEGIN {
maxorder = 0;
grammar_name = "PFSG";
bo_name = "__BACKOFF__";
start_bo_name = bo_name " __FROM_START__";
check_bows = 0;
epsilon = 1e-5; # tolerance for lowprob detection
}
NR == 1 {
start_grammar(grammar_name);
if (read_contexts) {
while ((getline context < read_contexts) > 0) {
is_context[context] = 1;
}
close(read_contexts);
}
}
NF == 0 {
next;
}
/^ngram *[0-9][0-9]*=/ {
num_grams = substr($2,index($2,"=")+1);
if (num_grams > 0) {
order = substr($2,1,index($2,"=")-1);
# limit maximal N-gram order if desired
if (maxorder > 0 && order > maxorder) {
order = maxorder;
}
if (order == 1) {
grammar_name = "UNIGRAM_PFSG";
} else if (order == 2) {
grammar_name = "BIGRAM_PFSG";
} else if (order == 3) {
grammar_name = "TRIGRAM_PFSG";
} else {
grammar_name = "NGRAM_PFSG";
}
}
next;
}
/^\\[0-9]-grams:/ {
currorder = substr($0,2,1);
next;
}
/^\\/ {
next;
}
#
# unigram parsing
#
currorder == 1 {
first_word = last_word = ngram = $2;
ngram_prefix = ngram_suffix = "";
# we need all unigram backoffs (except for </s>),
# so fill in missing bow where needed
if (NF == 2 && last_word != end_tag) {
$3 = 0;
}
}
#
# bigram parsing
#
currorder == 2 {
ngram_prefix = first_word = $2;
ngram_suffix = last_word = $3;
ngram = $2 " " $3;
}
#
# trigram parsing
#
currorder == 3 {
first_word = $2;
last_word = $4;
ngram_prefix = $2 " " $3;
ngram_suffix = $3 " " $4;
ngram = ngram_prefix " " last_word;
}
#
# higher-order N-gram parsing
#
currorder >= 4 && currorder <= order {
first_word = $2;
last_word = $(currorder + 1);
ngram_infix = $3;
for (i = 4; i <= currorder; i ++ ) {
ngram_infix = ngram_infix " " $i;
}
ngram_prefix = first_word " " ngram_infix;
ngram_suffix = ngram_infix " " last_word;
ngram = ngram_prefix " " last_word;
}
#
# shared code for N-grams of all orders
#
currorder <= order {
prob = $1;
bow = $(currorder + 2);
# skip backoffs that exceed maximal order,
# but always include unigram backoffs
if (bow != "" && (currorder == 1 || currorder < order)) {
# remember all LM contexts for creation of N-gram transitions
bows[ngram] = bow;
# To avoid empty paths through backoff, we reroute transitions
# out of the start node to a special backoff node that does not
# connect directly to the end node.
if (no_empty_bo && ngram == start_tag) {
this_bo_name = start_bo_name;
} else {
this_bo_name = bo_name;
}
# insert backoff transitions
if (read_contexts ? (ngram in is_context) : \
(currorder < order - 1)) \
{
add_trans(this_bo_name " " ngram, this_bo_name " " ngram_suffix, bow);
add_trans(ngram, this_bo_name " " ngram, 0);
} else {
add_trans(ngram, this_bo_name " " ngram_suffix, bow);
}
if (write_contexts) {
print ngram_suffix > write_contexts;
}
}
if (last_word == start_tag) {
if (currorder > 1) {
printf "warning: ignoring ngram into start tag %s -> %s\n", \
ngram_prefix, last_word >> "/dev/stderr";
}
} else {
# insert N-gram transition to maximal suffix of target context
if (last_word == end_tag) {
target = end_tag;
} else if (ngram in bows || currorder == 1) {
# the minimal context is unigram
target = ngram;
} else if (ngram_suffix in bows) {
target = ngram_suffix;
} else {
target = ngram_suffix;
for (i = 3; i <= currorder; i ++) {
target = substr(target, length($i) + 2);
if (target in bows) break;
}
}
if (currorder == 1 || \
(read_contexts ? (ngram_prefix in is_context) : \
(currorder < order))) \
{
add_trans(bo_name " " ngram_prefix, target, prob);
# Duplicate transitions out of unigram backoff for the
# start-backoff-node
if (no_empty_bo && \
node_exists(start_bo_name " " ngram_prefix) && \
target != end_tag)
{
add_trans(start_bo_name " " ngram_prefix, target, prob);
}
} else {
add_trans(ngram_prefix, target, prob);
}
if (check_bows) {
if (currorder < order) {
probs[ngram] = prob;
}
if (ngram_suffix in probs && \
probs[ngram_suffix] + bows[ngram_prefix] - prob > epsilon)
{
printf "warning: ngram loses to backoff %s -> %s\n", \
ngram_prefix, last_word >> "/dev/stderr";
}
}
}
}
END {
end_grammar(grammar_name);
}

View File

@@ -0,0 +1,49 @@
#!/usr/local/bin/gawk -f
#
# make-sub-lm --
# extract a lower-order backoff LM from a higher order one.
#
# usage: make-sub-lm maxorder=<n> lm-file > sub-lm-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-sub-lm.gawk,v 1.2 1998/11/09 05:54:12 stolcke Exp $
#
BEGIN {
maxorder=2;
}
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
if (order <= maxorder) print;
next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
if (currorder <= maxorder) {
print;
} else {
print "\n\\end\\";
exit;
}
next;
}
/^\\/ {
print; next;
}
currorder {
if (currorder < maxorder) {
print;
} else if (currorder == maxorder) {
#
# delete backoff weight for maximal ngram
#
if (NF == currorder + 2) {
$NF = "";
}
print;
}
next;
}
{ print }

View File

@@ -0,0 +1,133 @@
#!/bin/sh
#
# merge-batch-counts --
# combine batch count files into a single count file
#
# $Header: /home/srilm/CVS/srilm/utils/src/merge-batch-counts,v 1.9 2013/03/19 18:37:51 stolcke Exp $
#
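# Example: merge all count files left by make-batch-counts under ./counts,
# pairwise (the default), into a single final count file:
#	merge-batch-counts counts
# Use -l N to merge N files per ngram-merge call; a file list or a start
# iteration number may be given as a second argument to resume a run.
#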
merge_options=
merge_size=2
while [ $# -gt 0 ]; do
case "$1" in
-float-counts)
merge_options=-float-counts
shift
;;
-l) merge_size=$2
shift; shift
;;
*) break
;;
esac
done
if [ $# -lt 1 ]; then
echo "usage: $0 [-float-counts] [-l N] countdir [file-list | start-iter]" >&2
exit 2
fi
countdir=${1-./counts}
filelist=$2
iter=0
mergedir=$countdir
merger=ngram-merge
newfilefile=$mergedir/newfiles$$
set -e
# find right xarg option
if xargs -L1 </dev/null >/dev/null 2>&1; then
xargs_l=L
else
xargs_l=l
fi
# make sure partially generated files are deleted
trap 'rm -f $newfile $newfilefile $test_in $test_out; exit 1' 1 2 15
# determine if ngram-merge can generate compressed files
test_in=$mergedir/testin
test_out=$mergedir/testout.gz
echo "x 1" > $test_in
$merger -write $test_out $test_in $test_in
if gzip -l $test_out >/dev/null 2>&1; then
gz=.gz
else
gz=
fi
rm $test_in $test_out
case X$filelist in
X[0-9]*)
# restart a previous run
what=merge
iter=`expr $filelist + 1`
infiles=$mergedir/$what-iter$iter.files
find $countdir/. \( \
-name $what-iter$filelist-\*.ngrams.gz -o \
-name $what-iter$filelist-\*.ngrams \) -print | \
sort | xargs -${xargs_l}2 /bin/echo > $infiles
;;
X)
what=merge
infiles=$mergedir/$what-iter$iter.files
find $countdir/. \( \
-name \*.ngrams.gz -o \
-name \*.ngrams \) -print | sort | \
xargs -${xargs_l}2 /bin/echo > $infiles
;;
X*)
what=`basename $filelist .files`
infiles=$mergedir/$what-iter$iter.files
cat $filelist > $infiles
;;
esac
numfiles=`wc -w < $infiles`
while [ $numfiles -gt 1 ]; do
echo "ITERATION $iter, $numfiles files" >&2
fileno=1
> $newfilefile
while read file1 morefiles; do
newfile=$mergedir/$what-iter$iter-$fileno.ngrams$gz
if [ -f $newfile ]; then
echo "retaining old $newfile" >&2
echo $newfile >>$newfilefile
elif [ -z "$morefiles" ]; then
echo "linking $file1 to $newfile" >&2
rm -f $newfile
ln $file1 $newfile
# put the linked file at the top of the file list
# for the next iteration, to keep file sizes balanced
mv $newfilefile $newfilefile.old
echo $newfile >$newfilefile
cat $newfilefile.old >> $newfilefile
rm $newfilefile.old
else
echo "merging $file1 $morefiles into $newfile" >&2
$merger $merge_options -write $newfile $file1 $morefiles
echo $newfile >>$newfilefile
fi
fileno=`expr $fileno + 1`
done < $infiles
xargs rm -f < $infiles
iter=`expr $iter + 1`
infiles=$mergedir/$what-iter$iter.files
cat $newfilefile | xargs -${xargs_l}$merge_size /bin/echo > $infiles
numfiles=`wc -w < $infiles`
done
rm -f $newfilefile
echo "final counts in `cat $infiles`" >&2

View File

@@ -0,0 +1,180 @@
#!/usr/local/bin/gawk -f
#
# merge-nbest --
# merge hyps from multiple N-best lists into a single list
#
# $Header: /home/srilm/CVS/srilm/utils/src/merge-nbest.gawk,v 1.8 2010/08/20 00:17:18 stolcke Exp $
#
BEGIN {
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
bytelogscale = M_LN10 * 10000.5 / 1024.0;
use_orig_hyps = 1;
add_scores = 0;
last_nbestformat = -1;
nbestmagic1 = "NBestList1.0";
nbestmagic2 = "NBestList2.0";
pause = "-pau-";
max_nbest = 0;
multiwords = 0;
multichar = "_";
nopauses = 0;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
function process_nbest(file) {
input = "exec gzip -dcf " file;
nbestformat = 0;
num_hyps = 0;
while ((status = (input | getline)) > 0) {
if ($1 == nbestmagic1) {
nbestformat = 1;
} else if ($1 == nbestmagic2) {
nbestformat = 2;
} else {
words = "";
num_words = 0;
num_hyps ++;
if (max_nbest > 0 && num_hyps > max_nbest) {
break;
}
if (nbestformat == 1) {
for (i = 2; i <= NF; i++) {
words = words " " $i;
if ($i != pause) num_words ++;
}
score = substr($1, 2, length($1)-2)/bytelogscale;
num_words = 1;
} else if (nbestformat == 2) {
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have
# incorrect timemarks. We filter them based on their
# token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
words = words " " $i;
if ($i != pause) num_words ++;
prev_end_time = end_time;
}
}
score = substr($1, 2, length($1)-2)/bytelogscale;
} else {
for (i = 4; i <= NF; i++) {
words = words " " $i;
}
score = $1 + 8 * $2;
num_words = $3;
}
# resolve multiwords and eliminate pauses if so desired
if (multiwords) {
gsub(multichar, " ", words);
}
if (nopauses) {
gsub(" " pause, " ", words);
}
# if word sequence is new, record it
if (!(words in scores)) {
scores[words] = score;
hyps[words] = $0;
nwords[words] = num_words;
} else if (add_scores) {
scores[words] = addlogs(scores[words], score);
}
if (last_nbestformat < 0) {
last_nbestformat = nbestformat;
} else if (nbestformat != last_nbestformat) {
use_orig_hyps = 0;
last_nbestformat = nbestformat;
}
}
}
if (status < 0) {
print "error opening " file >> "/dev/stderr";
}
close(input);
}
function output_nbest() {
if (!use_orig_hyps || use_orig_hyps && last_nbestformat == 1) {
print nbestmagic1;
} else if (use_orig_hyps && last_nbestformat == 2) {
print nbestmagic2;
}
for (words in scores) {
if (add_scores) {
print scores[words], 0, nwords[words], words;
} else if (use_orig_hyps) {
print hyps[words];
} else {
print "(" (scores[words] * bytelogscale) ")" words;
}
}
}
BEGIN {
if (ARGC < 2) {
print "usage: " ARGV[0] " N-BEST1 N-BEST2 ..." \
>> "/dev/stderr";
exit(2);
}
for (arg = 1; arg < ARGC; arg ++) {
if (equals = index(ARGV[arg], "=")) {
var = substr(ARGV[arg], 1, equals - 1);
val = substr(ARGV[arg], equals + 1);
if (var == "multiwords") {
multiwords = val + 0;
} else if (var == "multichar") {
multichar = val;
} else if (var == "max_nbest") {
max_nbest = val + 0;
} else if (var == "nopauses") {
nopauses = val + 0;
} else if (var == "use_orig_hyps") {
use_orig_hyps = val + 0;
} else if (var == "add_scores") {
add_scores = val + 0;
}
} else {
process_nbest(ARGV[arg]);
}
}
output_nbest();
}
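
Example usage (a sketch; the nbest file names are placeholders, and the variable assignments are the ones defined in the BEGIN block above):

    # merge two systems' nbest lists, keeping at most 100 hyps per list
    # and stripping pause tokens
    merge-nbest max_nbest=100 nopauses=1 \
        sys1/utt0001.score.gz sys2/utt0001.score.gz > merged/utt0001.score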

View File

@@ -0,0 +1,337 @@
#!/usr/local/bin/gawk -f
#
# metadb --
# access the META-DB
#
# These files are subject to the SRILM Community Research License Version
# 1.0 (the "License"); you may not use these files except in compliance
# with the License. A copy of the License is included in the SRILM root
# directory. Software distributed under the License is distributed on an
# "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
# See the License for the specific language governing rights and
# limitations under the License. This software is Copyright (c) SRI
# International, 1995-2011. All rights reserved.
#
# $Header: /home/srilm/CVS/srilm/utils/src/metadb.gawk,v 1.26 2011/11/26 06:22:34 stolcke Exp $
#
function do_defines() {
# process all defines
for (d in defines) {
gsub(d, defines[d]);
}
# remove leading and trailing whitespace from value
sub("^[ ]*", "");
sub("[ ]*$", "");
}
function print_error(msg) {
print filename ", line " lineno ": " msg >> "/dev/stderr";
}
# process an included file
# return 1 if caller should quit reading, 0 if not
function process_config_file(file) {
if (file in including) {
print "metadb INCLUDE looping through " file >> "/dev/stderr";
exit 2
}
including[file] = 1;
if (trace_includes) {
print "READING " file >> "/dev/stderr";
}
filename = file;
lineno = 0;
while ((status = (getline < file)) > 0) {
lineno ++;
# skip comments and empty lines
if (NF == 0 || $1 ~ /^#/) {
continue;
}
if ($1 == "DEFINE") {
if (NF < 2) {
print_error("incomplete DEFINE");
exit 2;
} else {
symbol = $2;
$1 = $2 = "";
do_defines();
defines[symbol] = $0;
}
} else if ($1 == "SDEFINE") {
if (NF < 2) {
print_error("incomplete SDEFINE");
exit 2;
} else {
symbol = $2;
$1 = $2 = "";
do_defines();
# run right-hand-side as command and use output as value
$0 | getline defines[symbol];
close($0);
}
} else if ($1 == "MDEFINE") {
if (NF < 2) {
print_error("incomplete MDEFINE");
exit 2;
} else if (!recursive) {
symbol = $2;
$1 = $2 = "";
# look up the right-hand-side as metadb key,
# avoiding recursive invocations
db_command = "metadb -recursive -config " config_file " " $0;
if (debug) {
print "metadb: " symbol " mdefined by: " db_command >> "/dev/stderr";
}
db_command | getline defines[symbol];
close(db_command);
}
} else if ($1 == "UNDEF") {
if (NF < 2) {
print_error("incomplete UNDEF");
exit 2;
} else {
delete defines[$2];
}
} else if ($1 == "INCLUDE") {
if (NF < 2) {
print_error("missing INCLUDE filename");
exit 1
} else {
$1 = "";
do_defines();
if (! ($0 ~ /^\//)) {
includefile = file;
sub("[^/]*$", "", includefile);
if (includefile) {
includefile = includefile $0;
} else {
includefile = $0;
}
} else {
includefile = $0;
}
if (process_config_file(includefile)) {
close(file);
delete including[file];
return 1;
}
filename = file;
if (trace_includes) {
print "READING " file >> "/dev/stderr";
}
}
} else if ($1 == "ALIAS") {
if (NF != 3 || $2 == $3) {
print_error("illegal ALIAS");
exit 2
}
if (dump_values) print $0;
if ($2 == key) {
if (debug) {
print "metadb: " key " redirected to " $3 >> "/dev/stderr";
}
# close all currently read files so they can be read again
# from the top
for (f in including) {
close(f)
}
# forget all current file inclusions
delete including;
key = $3;
return process_config_file(config_file);
}
} else if ($1 == "ALIAS_SUFFIX") {
if (NF != 3 || $2 == $3) {
print_error("illegal ALIAS_SUFFIX");
exit 2
}
if (dump_values) print $0;
suffix_len = length($2);
key_len = length(key);
key_prefix = substr(key, 1, key_len-suffix_len);
if ($2 == substr(key, key_len-suffix_len+1) && !index(key_prefix, "_")) {
# close all currently read files so they can be read again
# from the top
for (f in including) {
close(f)
}
# forget all current file inclusions
delete including;
old_key = key;
key = key_prefix $3;
if (debug) {
print "metadb: " old_key " redirected to " key >> "/dev/stderr";
}
return process_config_file(config_file);
}
} else if ($1 == key || dump_values) {
this_key = $1;
$1 = "";
do_defines();
if ($0 == "__END__") {
if (dump_values) {
have_keys[this_key] = 1;
continue;
} else {
close(file);
delete including[file];
return 1;
}
}
if (query_mode) {
exit 0;
} else if (dump_values) {
# when dumping all keys, output the first key value found
if (!(this_key in have_keys)) {
print this_key, $0;
if (!all_values) {
have_keys[this_key] = 1;
}
}
} else {
if (debug) {
print "metadb: " key "=" $0 >> "/dev/stderr";
}
if (!error_mode || $0 != "") {
key_found = 1;
print;
}
}
if (!all_values && !dump_values) {
close(file);
delete including[file];
return 1;
}
}
}
if (status < 0) {
print "error reading " file >> "/dev/stderr";
exit 2;
}
close(file);
delete including[file];
return 0;
}
function print_usage() {
print "usage: metadb [-options ...] key1 [key2 ...]";
print "-q query mode -- check if key is defined";
print "-e exit with error message if key is undefined";
print "-all return multiple key values";
print "-dump dump all key and values";
print "-includes list included files";
print "-config FILE set config file (default $" db_config ")";
}
BEGIN {
key = "";
all_values = 0;
dump_values = 0;
trace_includes = 0;
recursive = 0;
db_config = "METADB_CONFIG";
config_file = "";
query_mode = 0;
error_mode = 0;
debug = ENVIRON["METADB_DEBUG"];
for (i = 1; i < ARGC ; i ++) {
if (ARGV[i] == "-q") {
query_mode = 1;
} else if (ARGV[i] == "-e") {
error_mode = 1;
} else if (ARGV[i] == "-all") {
all_values = 1;
} else if (ARGV[i] == "-dump") {
dump_values = 1;
} else if (ARGV[i] == "-includes") {
trace_includes = 1;
} else if (ARGV[i] == "-recursive") {
recursive = 1;
} else if (ARGV[i] == "-config") {
config_file = ARGV[i + 1];
i ++;
} else if (ARGV[i] == "-help") {
print_usage();
exit 0;
} else if (ARGV[i] ~ /^-/) {
print "unknown option: " ARGV[i] >> "/dev/stderr";
exit 2;
} else {
break;
}
}
if (!config_file) {
if (db_config in ENVIRON) {
config_file = ENVIRON[db_config];
} else {
print db_config " not defined" >> "/dev/stderr";
exit 1;
}
}
if (config_file == "") {
print "empty config file name" >> "/dev/stderr";
exit 1;
}
if (dump_values) {
key = "";
process_config_file(config_file);
}
for ( ; i < ARGC ; i ++) {
key = ARGV[i];
key_found = 0;
process_config_file(config_file);
if (error_mode && !key_found) {
print "key \"" key "\" empty or not defined in " config_file \
>> "/dev/stderr";
exit 1;
}
}
if (query_mode) {
# we only get here if nothing was found, so return with error
exit 1;
}
}
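
Example usage (a sketch; expt.config and the key names are placeholders):

    METADB_CONFIG=expt.config; export METADB_CONFIG

    metadb TRAIN_TEXT                   # print the value bound to key TRAIN_TEXT
    metadb -q DEV_NBEST_DIR             # exit status 0 iff the key is defined
    metadb -dump -config expt.config    # dump all keys and values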

View File

@@ -0,0 +1,56 @@
#!/bin/sh
#
# nbest-error --
# compute minimum error of nbest lists
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-error,v 1.6 2013/03/09 07:13:02 stolcke Exp $
#
if [ $# -lt 2 ]; then
echo "usage: $0 score-dir refs [nbest-lattice-option ...]" >&2
echo " or $0 file-list refs [nbest-lattice-option ...]" >&2
exit 2
fi
scoredir="$1"
refs="$2"
shift; shift
option=-nbest-error
case "$*" in
*-lattice-error*) option= ;;
esac
if [ ! -r $scoredir ]; then
echo "$0: cannot access $scoredir" >&2
exit 1
fi
if [ ! -r $refs ]; then
echo "$0: cannot access $refs" >&2
exit 1
fi
if [ -d $scoredir ]; then
find $scoredir -follow \
-type f \( -name \*.score -o \
-name \*.Z -o \
-name \*.gz \) \
-print | sort
else
cat $scoredir
fi | \
nbest-lattice -nbest-files - -refs $refs $option "$@" | \
${GAWK-gawk} '
$2 ~ /^[0-9]*$/ && $10 ~ /^[0-9]*$/ && $9 == "words" {
nsents ++;
nwords += $10;
nerrors += $2;
print;
}
END {
printf "%d sentences, %d words, %d errors (%.2f%%)\n", \
nsents, nwords, nerrors, 100*nerrors/nwords;
}'
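
Example usage (a sketch; the score directory and reference file are placeholders, and any trailing options are passed through to nbest-lattice):

    # minimum achievable word error of the nbest lists under nbest/dev,
    # scored against the references in dev.refs
    nbest-error nbest/dev dev.refs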

View File

@@ -0,0 +1,96 @@
#!/usr/local/bin/gawk -f
#
# nbest-oov-counts --
# generate OOV counts for an nbest list
#
# usage: nbest-oov-counts vocab=VOCAB [vocab_aliases=ALIASES] NBESTLIST > COUNTS
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-oov-counts.gawk,v 1.2 2017/08/15 19:29:34 stolcke Exp $
#
BEGIN {
nbestformat = 0;
}
$1 ~ /^NBestList1\.0/ {
nbestformat = 1;
next;
}
$1 ~ /^NBestList2\.0/ {
nbestformat = 2;
next;
}
NR == 1 {
nwords = 0;
while ((getline line < vocab) > 0) {
if (split(line, a) > 0) {
in_vocab[a[1]] = 1;
nwords ++;
}
}
print "read " nwords " vocab words" > "/dev/stderr";
naliases = 0;
if (vocab_aliases) {
while ((getline line < vocab_aliases) > 0) {
if (split(line, a) >= 2) {
vocab_mapping[a[1]] = a[2];
naliases ++;
}
}
print "read " naliases " vocab aliases" > "/dev/stderr";
}
# add default vocabulary
in_vocab["<s>"] = 1;
in_vocab["</s>"] = 1;
in_vocab["-pau-"] = 1;
}
function process_word(w) {
if (w in vocab_mapping) {
word = vocab_mapping[w];
} else {
word = w;
}
if (!(word in in_vocab)) {
oov_count ++;
}
}
NF > 1 {
oov_count = 0;
if (nbestformat == 1) {
# for Decipher nbest format 1 we use the aggregate score only
for (i = 2; i <= NF; i ++) {
process_word($i);
}
} else if (nbestformat == 2) {
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
process_word($i);
prev_end_time = end_time;
}
}
} else {
for (i = 4; i <= NF; i ++) {
process_word($i);
}
}
print oov_count;
}
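
Example usage (a sketch; the vocabulary and nbest file names are placeholders):

    # one OOV count per hyp, in nbest-list order
    nbest-oov-counts vocab=lm.vocab nbest/utt0001.score > oov/utt0001.counts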

View File

@@ -0,0 +1,64 @@
#!/usr/local/bin/gawk -f
#
# nbest-optimize-args-from-rover-control --
# Extract initial score weights and arguments from rover-control file
# for use with nbest-optimize
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-optimize-args-from-rover-control.gawk,v 1.2 2017/08/16 06:34:16 stolcke Exp $
#
BEGIN {
num_extras = 0;
}
# skip comment or empty line
/^##/ || /^[ ]*$/ {
next;
}
# extra score file line
$3 == "+" {
num_extras ++;
extra_dir[num_extras] = $1;
extra_weight[num_extras] = $2;
next;
}
# main system
{
system_dir = $1;
lm_weight = $2;
wt_weight = $3;
max_nbest = $5;
post_scale = $6;
weights = "1 " lm_weight " " wt_weight;
for (i = 1; i <= num_extras; i ++) {
weights = weights " " extra_weight[i];
}
if (print_weights) {
print weights;
} else if (print_dirs) {
for (i = 1; i <= num_extras; i ++) {
print extra_dir[i];
}
} else {
# output all arguments
if (post_scale != "" && post_scale != 0) {
print "-posterior-scale " post_scale;
}
if (max_nbest != "" && max_nbest != 0) {
print "-max-nbest " max_nbest;
}
print "-init-lambdas '" weights "'";
for (i = 1; i <= num_extras; i ++) {
print extra_dir[i];
}
}
num_extras = 0;
}
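
Example usage (a sketch; rover.control is a placeholder for an nbest-rover control file, and the output is intended to be pasted into an nbest-optimize command line, per the header comment):

    # print -posterior-scale/-max-nbest/-init-lambdas arguments and extra score dirs
    nbest-optimize-args-from-rover-control rover.control

    # print only the initial score weights
    nbest-optimize-args-from-rover-control print_weights=1 rover.control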

View File

@@ -0,0 +1,184 @@
#!/usr/local/bin/gawk -f
#
# nbest-posteriors --
# rescale the scores in an nbest list to reflect weighted posterior
# probabilities
#
# usage: nbest-posteriors [ weight=W amw=AMW lmw=LMW wtw=WTW postscale=S max_nbest=M ] NBEST-FILE
#
# The output is the same input NBEST-FILE with acoustic scores set to
# the log10 of the posterior hyp probabilities and LM scores set to zero.
# postscale=S attenuates the posterior distribution by dividing combined log
# scores by S (the default is S=LMW).
#
# If weight=W is specified the posteriors are multiplied by W.
# (This is useful to combine multiple nbest lists in a weighted fashion).
# The input should be in SRILM nbest-format.
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-posteriors.gawk,v 1.14 2019/02/08 14:13:35 stolcke Exp $
#
BEGIN {
M_LN10 = 2.30258509299404568402;
weight = 1.0;
amw = 1.0;
lmw = 8.0;
wtw = 0.0;
postscale = 0;
max_nbest = 0;
logINF = -320; # log10 of smallest representable number
log_total_numerator = logINF;
bytelogscale = 1024.0 / 10000.5 / M_LN10;
nbestformat = 0;
noheader = 0;
# tag to identify nbest list in output_posteriors
nbest_tag = 1;
}
function log10(x) {
return log(x)/M_LN10;
}
function exp10(x) {
if (x <= logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
# by default, use posterior scale = lmw
NR == 1 {
if (!postscale) {
if (lmw == 0) {
postscale = 1.0;
} else {
postscale = lmw;
}
}
}
$1 ~ /^NBestList1\.0/ {
nbestformat = 1;
if (!noheader) {
# keep header in output
print;
}
if (lmw != 0 || wtw != 0) {
print "warning: cannot apply LMW or WTW to Decipher N-nbest lists" \
>> "/dev/stderr";
}
next;
}
$1 ~ /^NBestList2\.0/ {
nbestformat = 2;
if (!noheader) {
# keep header in output
print;
}
next;
}
NF > 1 {
if (max_nbest && num_hyps == max_nbest) exit;
num_hyps ++;
if (nbestformat == 1) {
# for Decipher nbest format 1 we use the aggregate score only
total_score = substr($1,2,length($1)-2);
total_score *= bytelogscale * amw/postscale;
} else if (nbestformat == 2) {
total_score = substr($1,2,length($1)-2);
# compute total AC and LM scores
lm_score = 0;
num_tokens = 0;
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
num_tokens ++;
lm_score += $(i + 7);
prev_end_time = end_time;
}
}
# Compute AC score from total and lm scores. This takes into
# account that the recognizer might sum scores of equivalent hyps
# (e.g., those differing only in pauses or pronunciations) and
# reflect the summing in the total score, but not in the word AC
# scores.
ac_score = total_score - lm_score;
# Note we don't eliminate pause tokens from the word count, since
# the recognizer includes them in word count weighting.
# (Only after LM rescoring are pauses ignored.)
total_score = amw * ac_score + lmw * lm_score + wtw * num_tokens;
total_score *= bytelogscale/postscale;
} else {
total_score = (amw * $1 + lmw * $2 + wtw * $3)/postscale;
}
if (num_hyps == 1) {
score_offset = total_score;
}
total_score -= score_offset;
#
# store posteriors and hyp words
#
log_posteriors[num_hyps] = total_score;
log_total_numerator = addlogs(log_total_numerator, total_score);
num_words[num_hyps] = $3;
if (nbestformat > 0) {
$1 = "";
} else {
$1 = $2 = $3 = "";
}
hyps[num_hyps] = $0;
}
END {
for (i = 1; i <= num_hyps; i ++) {
unweighted_logpost = log_posteriors[i] - log_total_numerator;
logpost = log10(weight) + unweighted_logpost;
if (nbestformat > 0) {
printf "(%f) %s\n", logpost / bytelogscale, hyps[i];
} else {
print logpost, 0, num_words[i], hyps[i];
}
if (output_posteriors) {
print nbest_tag, i, unweighted_logpost >> output_posteriors;
}
}
}
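
Example usage (a sketch; the nbest file names are placeholders and the weights are illustrative values only; piping through gzip -dcf mirrors how nbest-rover invokes this script):

    # convert scores to weighted log posteriors using LMW=8, WTW=0,
    # posterior scale 12, keeping at most 200 hyps
    gzip -dcf nbest/utt0001.score.gz | \
        nbest-posteriors lmw=8 wtw=0 postscale=12 max_nbest=200 > post/utt0001.score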

View File

@@ -0,0 +1,316 @@
#!/bin/sh
#
# nbest-rover --
# Combine multiple nbest lists ROVER-style
#
# usage: nbest-rover SENTIDS CONTROL-FILE [POSTERIORS]
#
# where SENTIDS is a list of sentence ids (filenames of nbest lists)
# if SENTIDS is "-" the list is inferred from the contents of
# the first N-best directory
# CONTROL-FILE describes the nbest list sets to be processed
#	POSTERIORS is an optional file to which word posterior probabilities
# are written.
#
# The format for CONTROL-FILE is
#
# DIR1 LMW1 WTW1 W1 [ N1 [ S1 ] ]
# DIR2 LMW2 WTW2 W2 [ N2 [ S2 ] ]
# ...
#
# Each DIRi names a directory in which nbest lists are to be found.
# LMWi and WTWi are the rescoring weights to be used for the corresponding
# directory. Wi is the weight to be given to the posteriors computed from
# the respective list. Ni are optional limits on the number of N-best hyps used.
# Si are optional posterior scaling parameters.
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-rover,v 1.43 2019/02/28 04:48:21 stolcke Exp $
#
if [ $# -lt 2 ]; then
echo "usage: $0 [ sentid-list | - ] control-file [posteriors [nbest-lattice-options]]" >&2
exit 2
fi
sentids=$1
control=$2
shift; shift
# for new-style gnu sort
_POSIX2_VERSION=199209
export _POSIX2_VERSION
amw=1
default_lmw=8
default_wtw=0
default_scale=0
default_max_nbest=0
default_weight=1
mesh_option=-use-mesh
if [ $# -gt 0 ]; then
posteriors=$1
shift
else
posteriors=/dev/null
fi
lattice_dir=
posteriors_dir=
nbest_dir=
ref_posteriors=
filter_script=cat
missing_nbest=
use_nbest_scripts=
debug_level=0
null_nbest=${TMPDIR-/tmp}/$$null.nbest
# collect remaining options (mostly to pass them to nbest-lattice)
while [ $# -gt 0 ]; do
case "$1" in
-debug) debug_level=$2
shift; shift ;;
-amw) amw=$2;
shift; shift ;;
-write-dir) lattice_dir=$2
options="$options $1 $2"
shift; shift ;;
-write-nbest-dir)
nbest_dir=$2
options="$options $1 $2"
shift; shift ;;
-write-nbest-posteriors)
posteriors_dir=$2;
shift; shift ;;
-write-ref-posteriors)
ref_posteriors=$2;
options="$options -record-hyps"
shift; shift ;;
-no-mesh) mesh_option= ;
shift ;;
-wer) # -wer implies -no-mesh
mesh_option= ;
options="$options $1"
shift ;;
-missing-nbest)
echo "0 0 0" > $null_nbest
missing_nbest=1
use_nbest_scripts=1
shift ;;
-nbest-backtrace)
# Decipher2 format with backtrace info
# -- need to use old nbest helper scripts
options="$options $1"
use_nbest_scripts=1
shift ;;
-nbest-backtrace-times-only)
# Decipher 2 format - but only timing
# information is needed
helper_options="-nbest-backtrace -decipher-nbest"
options="$options -nbest-backtrace"
shift ;;
-filter) filter_script="$2";
shift; shift ;;
*) options="$options $1"
shift ;;
esac
done
> $posteriors
tmpdir=${TMPDIR-/tmp}
tmp_post=$tmpdir/post$$
tmp_sentids=$tmpdir/sentids$$
tmp_nbest_dir=$tmpdir/nbest.dir$$
tmp_post_dir=$tmpdir/post.dir$$
tmp_lat_dir=$tmpdir/lat.dir$$
trap "rm -rf $tmp_post $tmp_sentids $tmp_nbest_dir $tmp_post_dir $tmp_lat_dir $null_nbest; exit" 0 1 2 15
mkdir -p $tmp_nbest_dir $tmp_post_dir $tmp_lat_dir
#
# make sentid list if none was specified
#
if [ "$sentids" = "-" ]; then
${GAWK-gawk} '{ print $1; exit }' $control | xargs ls | \
sed -e 's,.*/,,' -e 's,\.gz$,,' -e 's,\.score$,,' | \
sort > $tmp_sentids
else
sort +0 -1 $sentids > $tmp_sentids
fi
set -e
#
# create lattice output directory if needed
#
if [ -n "$lattice_dir" ]; then
mkdir -p "$lattice_dir"
elif [ -n "$ref_posteriors" ]; then
lattice_dir=$tmp_lat_dir
options="$options -write-dir $lattice_dir"
fi
if [ -n "$nbest_dir" ]; then
mkdir -p "$nbest_dir"
fi
if [ -n "$posteriors_dir" ]; then
mkdir -p "$posteriors_dir"
elif [ -n "$ref_posteriors" ]; then
posteriors_dir=$tmp_post_dir
fi
cat $tmp_sentids | \
while read sentid refwords
do
extra_weights=
extra_scores=
extra_wts_and_scores=
noheader=0
nbest_tag=1
if [ -n "$posteriors_dir" ]; then
posteriors_file=$posteriors_dir/$sentid
> $posteriors_file
else
posteriors_file=
fi
if [ -n "$use_nbest_scripts" ]; then
# handle DOS EOL, comment and empty lines
sed -e 's,
$,,' -e '/^##/d' -e '/^[ ]*$/d' $control | \
while read dir lmw wtw weight max_nbest scale rest
do
if [ "$wtw" = "+" ]; then
if [ -f $dir/$sentid.gz ]; then
extra_scores="$extra_scores $dir/$sentid.gz"
extra_wts_and_scores="$extra_wts_and_scores $lmw $dir/$sentid.gz"
elif [ -f $dir/$sentid ]; then
extra_scores="$extra_scores $dir/$sentid"
extra_wts_and_scores="$extra_wts_and_scores $lmw $dir/$sentid"
else
echo "$dir/$sentid" is missing >&2
continue
fi
extra_weights="$extra_weights $lmw"
continue
else
if [ -f $dir/$sentid ]; then
nbest_file=$dir/$sentid
elif [ -f $dir/$sentid.gz ]; then
nbest_file=$dir/$sentid.gz
elif [ -f $dir/$sentid.score.gz ]; then
nbest_file=$dir/$sentid.score.gz
elif [ -f $dir/$sentid.score ]; then
nbest_file=$dir/$sentid.score
else
echo -n "$dir/$sentid.score.gz is missing" >&2
extra_weights=
extra_scores=
extra_wts_and_scores=
if [ -n "$missing_nbest" ]; then
echo " - using empty hyp" >&2
nbest_file=$null_nbest
else
echo "" >&2
continue
fi
fi
if [ "$weight" = "=" ]; then
weight=$last_weight
else
last_weight=$weight
fi
if [ -n "$extra_weights" -o "$amw" != 1 ]; then
combine-acoustic-scores \
-v "weights=$amw $extra_weights" \
-v max_nbest=${max_nbest:-$default_max_nbest} \
$nbest_file $extra_scores
else
gzip -dcf $nbest_file
fi | \
nbest-posteriors noheader=$noheader \
lmw=${lmw:-$default_lmw} \
wtw=${wtw:-$default_wtw} \
weight=${weight:-$default_weight} \
max_nbest=${max_nbest:-$default_max_nbest} \
postscale=${scale:-$default_scale} \
nbest_tag=$nbest_tag \
output_posteriors=$posteriors_file
extra_weights=
extra_scores=
extra_wts_and_scores=
noheader=1
nbest_tag=`expr $nbest_tag + 1`
fi
done
else # use helper tool
nbest-rover-helper -debug $debug_level \
-sentid $sentid \
-rover-control $control \
-max-nbest $default_max_nbest \
-rescore-amw $amw \
-rescore-lmw $default_lmw \
-rescore-wtw $default_wtw \
-posterior-weight $default_weight \
-posterior-scale $default_scale \
-write-posteriors "$posteriors_file" \
$helper_options
fi | \
eval "$filter_script" \
> $tmp_nbest_dir/$sentid
if [ -n "$posteriors_file" ]; then
gzip -f $posteriors_file
fi
echo $tmp_nbest_dir/$sentid
done | \
nbest-lattice -nbest-files - \
$mesh_option \
-rescore-lmw 0 -rescore-wtw 0 \
-posterior-amw 0 -posterior-lmw 0 -posterior-wtw 0 \
-debug 2 $options 2>$tmp_post | \
while read sentid hyp
do
# delete tmp nbest lists to avoid huge data accumulation
if [ "$sentid" != "$last_sentid" ]; then
rm -f $tmp_nbest_dir/$sentid
last_sentid=$sentid
fi
echo "$sentid $hyp"
done
if [ -n "$ref_posteriors" ]; then
> $ref_posteriors
cat $tmp_sentids | \
while read sentid refwords
do
if [ -f $lattice_dir/$sentid.gz ]; then
suffix=.gz
else
suffix=
fi
gzip -dcf $lattice_dir/$sentid$suffix | \
find-reference-posteriors sentid=$sentid \
posteriors_file=$posteriors_dir/$sentid$suffix >> $ref_posteriors
done
fi
# extract posteriors to file; output error messages; ignore others
${GAWK-gawk} '$2 == "post" { $2 = ""; print; next; }
$2 == "err" { next; }
{ print > "/dev/stderr"; }' $tmp_post > $posteriors

View File

@@ -0,0 +1,526 @@
/*
* nbest-rover-helper --
* Preprocess nbest lists for nbest-rover
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2010 SRI International, 2017 Andreas Stolcke, Microsoft Corp. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: nbest-rover-helper.cc,v 1.10 2019/09/09 23:13:15 stolcke Exp $";
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <locale.h>
#include <assert.h>
#include <math.h>
#ifndef _MSC_VER
# include <unistd.h>
#endif
#include "option.h"
#include "version.h"
#include "File.h"
#include "Prob.h"
#include "Vocab.h"
#include "NBest.h"
#include "RefList.h"
#include "VocabMultiMap.h"
#include "MultiwordVocab.h" // for MultiwordSeparator
#include "Array.cc"
#include "MStringTokUtil.h"
#define DEBUG_ERRORS 1
#define DEBUG_POSTERIORS 2
/*
* default value for posterior* weights to indicate they haven't been set
*/
static int version = 0;
static unsigned debug = 0;
static char *vocabFile = 0;
static char *vocabAliasFile = 0;
static int toLower = 0;
static int multiwords = 0;
static const char *multiChar = MultiwordSeparator;
static int nbestBacktrace = 0;
static char *rescoreFile = 0;
static char *nbestFiles = 0;
static char *roverControlFile = 0;
static char *sentid = 0;
static char *writeNbestFile = 0;
static char *writeNbestDir = 0;
static int writeDecipherNbest = 0;
static unsigned maxNbest = 0;
static double rescoreAMW = 1.0;
static double rescoreLMW = 8.0;
static double rescoreWTW = 0.0;
static double posteriorScale = 0.0;
static double posteriorWeight = 1.0;
static int noPosteriors = 0;
static char *writePosteriors = 0;
static int nbestTag = 1;
static int optRest;
static Option options[] = {
{ OPT_TRUE, "version", &version, "print version information" },
{ OPT_UINT, "debug", &debug, "debugging level" },
{ OPT_STRING, "vocab", &vocabFile, "vocab file" },
{ OPT_STRING, "vocab-aliases", &vocabAliasFile, "vocab alias file" },
{ OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
{ OPT_TRUE, "multiwords", &multiwords, "split multiwords in N-best hyps" },
{ OPT_STRING, "multi-char", &multiChar, "multiword component delimiter" },
{ OPT_TRUE, "nbest-backtrace", &nbestBacktrace, "read backtrace info from N-best lists" },
{ OPT_STRING, "rescore", &rescoreFile, "hyp stream input file to rescore" },
{ OPT_STRING, "nbest", &rescoreFile, "same as -rescore" },
{ OPT_STRING, "nbest-files", &nbestFiles, "list of n-best filenames" },
{ OPT_STRING, "rover-control", &roverControlFile, "process nbest-rover control file" },
{ OPT_STRING, "sentid", &sentid, "sentence ID string for nbest-rover control file" },
{ OPT_STRING, "write-nbest", &writeNbestFile, "output n-best list" },
{ OPT_STRING, "write-nbest-dir", &writeNbestDir, "output n-best directory" },
{ OPT_TRUE, "decipher-nbest", &writeDecipherNbest, "output Decipher n-best format" },
{ OPT_UINT, "max-nbest", &maxNbest, "maximum number of hyps to consider" },
{ OPT_FLOAT, "rescore-amw", &rescoreAMW, "rescoring AM weight" },
{ OPT_FLOAT, "rescore-lmw", &rescoreLMW, "rescoring LM weight" },
{ OPT_FLOAT, "rescore-wtw", &rescoreWTW, "rescoring word transition weight" },
{ OPT_FLOAT, "posterior-scale", &posteriorScale, "divisor for log posterior estimates" },
{ OPT_FLOAT, "posterior-weight", &posteriorWeight, "overall weight of posterior probabilities" },
{ OPT_TRUE, "no-posteriors", &noPosteriors, "do not compute posterior probabilties (acoustic rescoring only)" },
{ OPT_STRING, "write-posteriors", &writePosteriors, "append posteriors probs to file" },
{ OPT_INT, "nbest-tag", &nbestTag, "subsystem tag number for posterior dump" },
{ OPT_REST, "-", &optRest, "indicate end of option list" },
{ OPT_DOC, 0, 0, "following options, an alternating list of weights and score files/directories" },
};
#ifdef _MSC_VER
# include <errno.h>
# include <sys/stat.h>
/*
* Emulate access(2) in Windows
*/
#define F_OK 0
#define R_OK 4
#define W_OK 2
#define X_OK 1
int
access(const char *path, int mode)
{
struct _stat buf;
if (_stat(path, &buf) < 0) {
return -1;
} else {
if (mode & R_OK && !(buf.st_mode & _S_IREAD)) {
errno = EPERM;
return -1;
}
if (mode & W_OK && !(buf.st_mode & _S_IWRITE)) {
errno = EPERM;
return -1;
}
if (mode & X_OK && !(buf.st_mode & _S_IEXEC)) {
errno = EPERM;
return -1;
}
return 0;
}
}
#endif /* _MSC_VER */
/*
* Read a list of scores from file
*/
Boolean
readScores(const char *filename, unsigned numHyps, unsigned maxN, Array<LogP2> &scores)
{
unsigned numScores = 0;
File file(filename, "r");
char *line;
while ((line = file.getline())) {
LogP2 score;
if (parseLogP(line, score)) {
scores[numScores ++] = score;
} else {
file.position() << "bad score value\n";
return false;
}
if (maxN > 0 && numScores == maxN) break;
}
if (numScores == numHyps || (maxN > 0 && numScores == maxN)) {
return true;
} else {
file.position() << "mismatched number of scores -- expecting "
<< numHyps << endl;
return false;
}
}
/*
* Process a single N-best list
*/
void
processNbest(Vocab &vocab, const char *sentid,
const char *nbestFile, unsigned maxN, Prob weight,
double LMW, double WTW, double postScale,
unsigned nScores, double scoreWeights[], const char *scoreFiles[],
File &outNbestFile, unsigned tag)
{
/*
* Process nbest list
*/
NBestList nbestList(vocab, maxN, multiwords ? multiChar : 0, nbestBacktrace);
nbestList.debugme(debug);
/*
* Posterior scaling: if not specified (= 0.0) use LMW for
* backward compatibility.
*/
if (postScale == 0.0) {
postScale = (LMW == 0.0) ? 1.0 : LMW;
}
if (debug > 0) {
cerr << "PROCESSING " << nbestFile
<< " maxn = " << maxN
<< " weight = " << weight
<< " lmw = " << LMW << " wtw = " << WTW
<< " scale = " << postScale
<< " extras =";
for (unsigned i = 0; i < nScores; i ++) {
cerr << " " << scoreWeights[i]
<< " " << scoreFiles[i];
}
cerr << endl;
}
if (nbestFile) {
File input(nbestFile, "r");
if (!nbestList.read(input)) {
cerr << "format error in nbest list\n";
exit(1);
}
} else {
File input(stdin);
if (!nbestList.read(input)) {
cerr << "format error in nbest list\n";
exit(1);
}
}
/*
* Apply AM weight
*/
if (rescoreAMW != 1.0) {
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
nbestList.getHyp(i).acousticScore *= rescoreAMW;
}
}
/*
* Add extra scores into AM score
*/
for (unsigned j = 0; j < nScores; j ++) {
if (scoreWeights[j] != 0.0) {
Array<LogP2> extraScores;
if (!readScores(scoreFiles[j], nbestList.numHyps(), maxN, extraScores)) {
exit(1);
}
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
nbestList.getHyp(i).acousticScore += scoreWeights[j] * extraScores[i];
}
}
}
if (!noPosteriors) {
/*
* compute log posteriors
*/
nbestList.computePosteriors(LMW, WTW, postScale, 1.0, true);
LogP logWeight = ProbToLogP(weight);
File posteriorFile;
if (writePosteriors && *writePosteriors) {
posteriorFile.reopen(writePosteriors, "a");
}
/*
* Encode log posteriors as acoustic scores, for output purposes
* Also, dump posterior to a separate file if requested
*/
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
nbestList.getHyp(i).acousticScore = nbestList.getHyp(i).posterior;
nbestList.getHyp(i).languageScore = 0.0;
nbestList.getHyp(i).totalScore = nbestList.getHyp(i).acousticScore;
if (writePosteriors && *writePosteriors) {
/* from nbest-posteriors.gawk:
* print nbest_tag, i, unweighted_logpost >> output_posteriors;
*/
posteriorFile.fprintf("%d %d %.*lg\n", tag, i+1,
Prob_Precision, (double)nbestList.getHyp(i).posterior);
}
nbestList.getHyp(i).acousticScore += logWeight;
}
}
nbestList.write(outNbestFile, writeDecipherNbest);
}
int
main (int argc, char *argv[])
{
setlocale(LC_CTYPE, "");
setlocale(LC_COLLATE, "");
argc = Opt_Parse(argc, argv, options, Opt_Number(options),
OPT_OPTIONS_FIRST);
/*
* Ensure arguments are in pairs (weight, scorefile)
*/
if ((argc-1) % 2 == 1) {
cerr << "number of arguments is not even (alternating weights and score files)\n";
exit(2);
}
unsigned nExtraScores = (argc-1)/2;
makeArray(double, scoreWeights, nExtraScores);
makeArray(const char *, scoreFiles, nExtraScores);
for (unsigned i = 0; i < nExtraScores; i ++) {
if (sscanf(argv[2*i + 1], "%lf", &scoreWeights[i]) != 1) {
cerr << "bad score weight " << argv[2*i + 1] << endl;
exit(2);
}
scoreFiles[i] = argv[2*i + 2];
}
if (version) {
printVersion(RcsId);
exit(0);
}
Vocab vocab;
vocab.toLower() = toLower ? true : false;
if (vocabFile) {
File file(vocabFile, "r");
vocab.read(file);
}
if (vocabAliasFile) {
File file(vocabAliasFile, "r");
vocab.readAliases(file);
}
File outFile(stdout);
/*
* Process single nbest file
*/
if (rescoreFile) {
if (writeNbestFile) {
outFile.reopen(writeNbestFile, "w");
}
processNbest(vocab, 0, rescoreFile, maxNbest, posteriorWeight,
rescoreLMW, rescoreWTW, posteriorScale,
nExtraScores, scoreWeights, scoreFiles,
outFile, nbestTag);
if (writeNbestFile) {
outFile.close();
}
}
/*
* Process list of nbest filenames
*/
if (nbestFiles) {
File file(nbestFiles, "r");
char *line;
while ((line = file.getline())) {
char *strtok_ptr = NULL;
char *fname = MStringTokUtil::strtok_r(line, wordSeparators, &strtok_ptr);
if (!fname) continue;
RefString sentid = idFromFilename(fname);
/*
* Construct score file names from directory path and sentid
*/
makeArray(char *, scoreFileNames, nExtraScores);
for (unsigned i = 0; i < nExtraScores; i ++) {
scoreFileNames[i] = new char[strlen(scoreFiles[i]) + 1 + strlen(sentid) + strlen(GZIP_SUFFIX) + 1];
sprintf(scoreFileNames[i], "%s/%s%s", scoreFiles[i], sentid,
GZIP_SUFFIX);
}
/*
* Construct output file names from directory path and sentid
*/
makeArray(char, writeNbestName,
(writeNbestDir ? strlen(writeNbestDir) : 0) + 1
+ strlen(sentid) + strlen(GZIP_SUFFIX) + 1);
if (writeNbestDir) {
sprintf(writeNbestName, "%s/%s%s", writeNbestDir, sentid, GZIP_SUFFIX);
outFile.reopen(writeNbestName, "r");
}
processNbest(vocab, sentid, fname, maxNbest, posteriorWeight,
rescoreLMW, rescoreWTW, posteriorScale,
nExtraScores, scoreWeights, (const char **)(char **)scoreFileNames,
outFile, nbestTag);
if (writeNbestDir) {
outFile.close();
}
for (unsigned i = 0; i < nExtraScores; i ++) {
delete [] scoreFileNames[i];
}
}
}
/*
* Process rover control file
*/
if (roverControlFile) {
if (!sentid) {
cerr << "no -sentid specified with rover control file\n";
exit(2);
}
File roverControl(roverControlFile, "r");
if (writeNbestFile) {
outFile.reopen(writeNbestFile, "w");
}
Array<char *> extraScores;
Array<double> extraWeights;
unsigned nExtraScores = 0;
Prob lastWeight = 1.0;
const char *scoreSuffix = ".score";
char *line;
while ((line = roverControl.getline())) {
char scoreDir[256], plus[10];
double lmw = rescoreLMW, wtw = rescoreWTW, postScale = posteriorScale;
unsigned maxN = maxNbest;
Prob weight = posteriorWeight;
char weightStr[30];
unsigned nparsed;
/*
* nbest-rover:
* read dir lmw wtw weight max_nbest scale rest
*/
if (sscanf(line, "%255s %lf %9s", scoreDir, &lmw, plus) == 3 && strcmp(plus, "+") == 0) {
extraScores[nExtraScores] = new char[strlen(scoreDir) + 1 + strlen(sentid) + strlen(GZIP_SUFFIX) + 1];
sprintf(extraScores[nExtraScores], "%s/%s%s", scoreDir, sentid, GZIP_SUFFIX);
if (access(extraScores[nExtraScores], R_OK) < 0) {
sprintf(extraScores[nExtraScores], "%s/%s", scoreDir, sentid);
if (access(extraScores[nExtraScores], R_OK) < 0) {
roverControl.position() << "no score file for sentid " << sentid << endl;
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
nExtraScores = 0;
continue;
}
}
extraWeights[nExtraScores] = lmw;
nExtraScores ++;
} else if ((nparsed = sscanf(line, "%255s %lf %lf %29s %u %lf", scoreDir, &lmw, &wtw, weightStr, &maxN, &postScale)) >= 1) {
char *nbestFile = new char[strlen(scoreDir) + 1 + strlen(sentid) + strlen(scoreSuffix) + strlen(GZIP_SUFFIX) + 1];
sprintf(nbestFile, "%s/%s%s", scoreDir, sentid, GZIP_SUFFIX);
if (access(nbestFile, R_OK) < 0) {
sprintf(nbestFile, "%s/%s", scoreDir, sentid);
if (access(nbestFile, R_OK) < 0) {
sprintf(nbestFile, "%s/%s%s%s", scoreDir, sentid, scoreSuffix, GZIP_SUFFIX);
if (access(nbestFile, R_OK) < 0) {
sprintf(nbestFile, "%s/%s%s", scoreDir, sentid, scoreSuffix);
if (access(nbestFile, R_OK) < 0) {
roverControl.position() << "no nbest file for sentid " << sentid << endl;
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
nExtraScores = 0;
delete [] nbestFile;
continue;
}
}
}
}
if (nparsed >= 4 && strcmp(weightStr, "=") == 0) {
weight = lastWeight;
} else {
if (!parseProb(weightStr, weight)) {
roverControl.position() << "bad weight value " << weightStr << endl;
weight = 0.0;
}
lastWeight = weight;
}
/*
	     * Now combine all the files
*/
processNbest(vocab, sentid, nbestFile, maxN, weight,
lmw, wtw, postScale,
nExtraScores, extraWeights, (const char **)(char **)extraScores,
outFile, nbestTag);
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
nExtraScores = 0;
delete [] nbestFile;
nbestTag ++;
} else {
roverControl.position() << "bad format in control file\n";
exit(1);
}
}
if (writeNbestFile) {
outFile.close();
}
}
exit(0);
}

View File

@@ -0,0 +1,59 @@
#!/usr/local/bin/gawk -f
#
# nbest-vocab --
# extract vocabulary used in nbest lists
#
# usage: nbest-vocab NBEST-FILE ... > VOCAB
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-vocab.gawk,v 1.2 2003/03/18 00:55:07 stolcke Exp $
#
BEGIN {
nbestformat = 0;
}
$1 ~ /^NBestList1\.0/ {
nbestformat = 1;
next;
}
$1 ~ /^NBestList2\.0/ {
nbestformat = 2;
next;
}
NF > 1 {
if (nbestformat == 1) {
# for Decipher nbest format 1 we use the aggregate score only
for (i = 2; i <= NF; i ++) {
is_word[$i] = 1;
}
} else if (nbestformat == 2) {
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
is_word[$i] = 1;
prev_end_time = end_time;
}
}
} else {
for (i = 4; i <= NF; i ++) {
is_word[$i] = 1;
}
}
}
END {
for (word in is_word) {
print word;
}
}

View File

@@ -0,0 +1,55 @@
#!/usr/local/bin/gawk -f
#
# nbest-words --
# extract words only nbest lists
#
# usage: nbest-words NBEST-FILE ...
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-words.gawk,v 1.1 2016/04/29 04:00:08 stolcke Exp $
#
BEGIN {
nbestformat = 0;
}
$1 ~ /^NBestList1\.0/ {
nbestformat = 1;
next;
}
$1 ~ /^NBestList2\.0/ {
nbestformat = 2;
next;
}
NF > 1 {
words = "";
if (nbestformat == 1) {
for (i = 2; i <= NF; i ++) {
words = words " " $i;
}
} else if (nbestformat == 2) {
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
words = words " " $i
prev_end_time = end_time;
}
}
} else {
for (i = 4; i <= NF; i ++) {
words = words " " $i;
}
}
print words;
}

View File

@@ -0,0 +1,37 @@
#!/usr/local/bin/gawk -f
#
# nbest2-to-nbest1 --
# Convert Decipher NBestList2.0 format to NBestList1.0 format
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest2-to-nbest1.gawk,v 1.4 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
magic1 = "NBestList1.0";
magic2 = "NBestList2.0";
}
NR == 1 {
if ($0 != magic2) {
print "Input not in " magic2 " format" >> "/dev/stderr";
exit 1;
}
print magic1;
next;
}
{
prev_end_time = -1;
line = $1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
line = line " " $i;
prev_end_time = end_time;
}
}
print line;
}

View File

@@ -0,0 +1,23 @@
#!/bin/sh
#
# pfsg-from-ngram --
# Convert a bigram or trigram into a Decipher PFSG
#
# This is a wrapper that takes care of
# - eliminating low probability transitions that the recognizer would never use
# - renormalizing the LM
# - converting to PFSG
# - adding pauses between words
#
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-from-ngram,v 1.3 2000/02/04 00:20:32 stolcke Exp $
#
# get LM from first argument, pass rest to ngram
# default LM is stdin
lm=${1--}
test $# -gt 0 && shift
ngram -debug 1 -prune-lowprobs -lm "$lm" "$@" -write-lm - | \
make-ngram-pfsg | \
add-pauses-to-pfsg
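
Example usage (a sketch; the LM and PFSG file names are placeholders):

    # prune low-probability transitions, renormalize, convert to PFSG, add pauses
    pfsg-from-ngram bigram.lm > bigram.pfsg

    # the LM can also come from stdin (the default argument is "-")
    gzip -dcf bigram.lm.gz | pfsg-from-ngram > bigram.pfsg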

View File

@@ -0,0 +1,87 @@
#!/usr/local/bin/gawk -f
#
# pfsg-to-dot --
# Generate dot(1) graph description from PFSG
#
# usage: pfsg-to-dot [show_probs=1] [show_nums=1] file.pfsg > file.dot
#
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-to-dot.gawk,v 1.5 2003/07/10 21:09:15 stolcke Exp $
#
BEGIN {
show_probs = 0;
show_logs = 0;
show_nums = 0;
in_a_pfsg = 0;
logscale = 10000.5;
}
function bytelog2prob(p) {
x = p / logscale;
if (x < -7e2) {
return 0;
} else {
return exp(x);
}
}
function bytelog2log10(p) {
return p / logscale / 2.30258509299404568402;
}
$1 == "name" {
name = $2;
# handle repeated PFSGs in the same file
if (in_a_pfsg)
print "} digraph \"" name "\" {";
else
print "digraph \"" name "\" {";
print "rankdir = LR";
dotrans = 0;
in_a_pfsg = 1;
}
function node_label(w, i) {
if (show_nums) {
return w "\\n" i;
} else {
return w;
}
}
$1 == "nodes" {
numnodes = $2;
for (i = 0; i < numnodes; i ++) {
print "\tnode" i " [label=\"" $(i + 3) \
(show_nums ? "\\n" i : "") "\"];"
}
}
$1 == "initial" {
i = $2;
# print "\tnode" i " [label=\"START\"];"
}
$1 == "final" {
i = $2;
# print "\tnode" i " [label=\"END\"];"
}
$1 == "transitions" {
dotrans = 1;
next;
}
dotrans && NF == 3 {
from = $1;
to = $2;
prob = $3;
print "\tnode" from " -> node" to \
(!(show_probs || show_logs) ? "" :
" [label=\"" (show_logs ? bytelog2log10(prob) :
bytelog2prob(prob)) "\"]") ";"
}
END {
print "}"
}
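
Example usage (a sketch; the file names are placeholders, and rendering assumes the graphviz dot(1) tool is available):

    pfsg-to-dot show_probs=1 show_nums=1 bigram.pfsg > bigram.dot
    dot -Tps bigram.dot > bigram.ps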

View File

@@ -0,0 +1,146 @@
#!/usr/local/bin/gawk -f
#
# pfsg-to-fsm --
# convert a Decipher PFSG to AT&T FSM format
#
# usage: pfsg-to-fsm [symbolfile=SYMFILE] [symbolic=1] [scale=S] file.pfsg > file.fsm
#
# symbolic=1 retains output word strings in the fsm file.
# symbolfile=SYMFILE dump output symbol table to SYMFILE
# (to be used with fsmcompile|fsmdraw|fsmprint -i SYMFILE)
# scale=S set transition weight scaling factor to S
# (default -1)
#
#
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-to-fsm.gawk,v 1.16 2015-07-03 03:45:38 stolcke Exp $
#
BEGIN {
empty_output = "NULL";
output_symbols[empty_output] = 0;
numoutputs = 1;
if ("TMPDIR" in ENVIRON) {
tmpdir = ENVIRON["TMPDIR"];
} else {
tmpdir = "/tmp"
}
if ("pid" in PROCINFO) {
pid = PROCINFO["pid"];
} else {
getline pid < "/dev/pid";
}
tmpfile = tmpdir "/pfsg.tmp" pid;
# hack to remove tmpfile when killed
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
print "" | trap_cmd;
symbolfile = "";
symbolic = 0;
scale = -1; # scaling of transition weights
nofinal = 0; # do output final node definition
final_output = "";
}
$1 == "nodes" {
numnodes = $2;
for (i = 0; i < numnodes; i++) {
node_output[i] = $(i + 3);
if (!(node_output[i] in output_symbols)) {
output_symbols[node_output[i]] = numoutputs++;
}
}
next;
}
$1 == "initial" {
initial_node = $2;
if (node_output[initial_node] != empty_output) {
print "initial node must be NULL" >> "/dev/stderr";
exit 1;
}
next;
}
$1 == "final" {
final_node = $2;
if (final_output) {
node_output[final_node] = final_output;
if (!(final_output in output_symbols)) {
output_symbols[final_output] = numoutputs++;
}
}
next;
}
function print_trans(from_node, to_node, cost) {
if (to_node == final_node && node_output[final_node] == empty_output) {
print from_node, scale * cost;
} else {
# PFSG bytelogs have to be negated to FSM default semiring
print from_node, to_node, \
(symbolic ? node_output[to_node] : \
output_symbols[node_output[to_node]]), \
scale * cost;
}
}
function print_final() {
# if the final node is non-emitting, we don't need to output it
# at all (see print_trans above)
if (!nofinal && node_output[final_node] != empty_output) {
print final_node, 0;
}
}
$1 == "transitions" {
num_transitions = $2;
# process the transitions and map them to FSM transitions and
# final states.
# FSM requires the first transition to be out of the initial state,
# so we scan the transitions twice.
# The first time, to find the initial transitions, then
# to add all the others. Yuck!
for (k = 1; k <= num_transitions; k ++) {
getline;
from_node = $1;
to_node = $2;
cost = $3;
if (from_node == initial_node) {
print_trans(from_node, to_node, cost);
} else {
print > tmpfile;
}
}
close(tmpfile);
# output definition of the final node
print_final();
# now process all the non-initial transitions
while (getline < tmpfile) {
from_node = $1;
to_node = $2;
cost = $3;
print_trans(from_node, to_node, cost);
}
next;
}
END {
# dump out the symbol table
if (symbolfile) {
for (s in output_symbols) {
print s, output_symbols[s] > symbolfile;
}
}
}
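
Example usage (a sketch; file names are placeholders, and fsmcompile is the AT&T FSM tool referred to in the header comment):

    pfsg-to-fsm symbolfile=bigram.syms bigram.pfsg > bigram.fsm
    fsmcompile -i bigram.syms bigram.fsm > bigram.fsmc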

View File

@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# pfsg-vocab --
# extract vocabulary used in PFSG
#
# usage: pfsg-vocab PFSG-FILE ... > VOCAB
#
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-vocab.gawk,v 1.1 2003/02/18 18:33:04 stolcke Exp $
#
BEGIN {
null = "NULL";
}
$1 == "nodes" {
for (i = 3; i <= NF; i ++) {
if ($i != null) {
is_word[$i] = 1;
}
}
next;
}
$1 == "name" {
# sub-pfsg names are not words, and might have been added during the
# processing of the nodes list
delete is_word[$2];
}
END {
for (word in is_word) {
print word;
}
}

View File

@@ -0,0 +1,55 @@
#!/usr/local/bin/gawk -f
#
# ppl-from-log --
# Recomputes perplexity from (a subset of) the output of
#
# ngram -debug 2 -ppl
#
# This is useful if one wants to analyse predictability of certain
# words/contexts.
#
# usage: ppl-from-log [howmany=<numsents>] ppl-log-file
#
# Copyright (c) 1995, SRI International. All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/ppl-from-log.gawk,v 1.4 2014-07-03 05:57:09 stolcke Exp $
#
function result () {
ppl = exp(-sum/(sentences + words - oovs) * M_LN10);
printf "file %s: %d sentences, %d words, %d oovs\n", \
FILENAME, sentences, words, oovs;
printf "%d zeroprobs, logprob= %f, ppl= %f\n", \
		0, sum, ppl;
}
BEGIN {
M_LN10 = 2.30258509299404568402; # from <math.h>
}
/^ p\( / {
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) {
oovs ++;
} else {
sum += $10;
}
if ($2 == "</s>") {
sentences ++;
} else {
words ++;
}
next;
}
/ ppl= / {
sents ++;
if (howmany > 0 && sents == howmany) {
result();
exit 0;
}
next;
}
{
next;
}
END {
result();
}
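
Example usage (a sketch; the text, LM, and log file names are placeholders):

    # generate a per-word ppl log, then recompute perplexity over its
    # first 100 sentences
    ngram -debug 2 -ppl dev.text -lm bigram.lm > dev.ppl-log
    ppl-from-log howmany=100 dev.ppl-log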

View File

@@ -0,0 +1,34 @@
#!/usr/local/bin/gawk -f
#
# Map words in a text file to zero or more expansions
#
# $Header: /home/srilm/CVS/srilm/utils/src/prettify.gawk,v 1.1 2001/03/24 06:41:31 stolcke Exp $
#
NR == 1 {
# read pretty map file
if (map) {
while ((getline mapline < map) > 0) {
npretty = split(mapline, pretty_list);
word = pretty_list[1];
pretty_map[word] = "";
for (i = 2; i <= npretty; i ++) {
pretty_map[word] = pretty_map[word] " " pretty_list[i];
}
}
}
}
function pretty_up() {
for (i = 1; i <= NF; i ++) {
if ($i in pretty_map) {
$i = pretty_map[$i];
}
if (multiwords) gsub("_", " ", $i);
}
}
{
pretty_up();
print;
}

View File

@@ -0,0 +1,141 @@
#!/usr/local/bin/gawk -f
#
# rank-vocab --
# Given K different rankings of candidate vocabularies, and
# a held-out optimization unigram count file, optimize the
# combined ranking of words
#
# usage: rank-vocab counts words1 words2 ... wordsK
#
# $Header: /home/srilm/CVS/srilm/utils/src/rank-vocab.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
num_sources = 0;
num_output = 0;
num_oovs = 0;
debug = 0;
}
ARGIND == 1 {
word_count[$1] = $2;
num_oovs += $2;
next;
}
ARGIND > 1 {
k = ARGIND - 1;
num_sources = k;
num_words[k] ++;
word_ranked[k, num_words[k]] = $1;
next;
}
function dump_words(k) {
print "source " k " words:";
for (i = 1; i <= num_words[k]; i ++) {
print i, word_ranked[k,i];
}
}
# find the next word from source k that occurs in the test set
# return 0 if no more words are available
function find_next(k) {
for (j = last_chosen[k] + 1; j <= num_words[k]; j ++) {
if (word_count[word_ranked[k,j]] > 0) {
if (debug) {
print "next word rank for source " k ": " j >> "/dev/stderr";
}
return j;
}
}
if (debug) {
print "no more words from source " k >> "/dev/stderr";
}
return 0;
}
# compute gain (number of OOV tokens reduced per number of word types added)
# by adding the next word from source k
function compute_gain(k) {
if (next_word[k] == 0) {
# no more words in source k, no gain
return -1;
} else {
g = word_count[word_ranked[k,next_word[k]]] / (next_word[k] - last_chosen[k]);
if (debug) {
print "next gain for source " k " = " g;
}
return g;
}
}
END {
# for (k = 1; k <= num_sources; k ++) {
# dump_words(k);
# }
for (k = 1; k <= num_sources; k ++) {
last_chosen[k] = 0;
next_word[k] = find_next(k);
gain[k] = compute_gain(k);
}
print "INITIAL OOVS = " num_oovs;
# add words until no more gain possible (i.e., until all source
# words have been used up)
while (1) {
best_gain = -1;
best_source = 0;
# find next best source to pick word from
for (k = 1; k <= num_sources; k ++) {
if (gain[k] > best_gain) {
best_source = k;
best_gain = gain[k];
}
}
if (best_gain < 0) break;
# process all the words from source k up to the one chosen
for (i = last_chosen[best_source] + 1; \
i <= next_word[best_source]; \
i ++) {
word_chosen = word_ranked[best_source,i]
if (debug) {
print "source = " best_source \
" gain = " best_gain \
" word = " word_chosen >> "/dev/stderr";
}
# output the word if it hasn't been already
if (!was_output[word_chosen]) {
num_output ++;
num_oovs -= word_count[word_chosen];
print "RANK " num_output " WORD " word_chosen \
" OOVS " num_oovs;
was_output[word_chosen] = 1;
}
}
# update the statistics for the source that was chosen
last_chosen[best_source] = next_word[best_source];
next_word[best_source] = find_next(best_source);
gain[best_source] = compute_gain(best_source);
}
}
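
Example usage (a sketch; file names are placeholders: the first argument is a held-out unigram count file and the remaining arguments are candidate vocabularies in ranked order):

    rank-vocab heldout.1grams web.vocab news.vocab > vocab.ranking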

View File

@@ -0,0 +1,106 @@
#!/usr/local/bin/gawk -f
#
# remove-lowprob-ngrams --
# Remove ngrams from a backoff LM that have lower prob than their
# backoff paths.
#
# $Header: /home/srilm/CVS/srilm/utils/src/remove-lowprob-ngrams.gawk,v 1.4 2004/11/02 02:00:35 stolcke Exp $
#
NF == 0 {
print;
next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
if (order > 3) {
print "warning: can only handle bigrams and trigrams" >> "/dev/stderr";
}
if (order > maxorder && $2 !~ /=0$/) {
maxorder = order;
}
print;
next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
print;
next;
}
/^\\/ {
print;
next;
}
#
# unigrams
#
currorder == 1 {
word = $2;
uni_prob[word] = $1;
if (NF > 2) {
uni_bow[word] = $3;
}
print;
}
#
# bigrams
#
currorder == 2 {
prob = $1;
word1 = $2;
word2 = $3;
words = $2 " " $3;
if (maxorder > 2) {
bi_prob[words] = prob;
if (NF > 3) {
bi_bow[words] = $4;
}
}
total_bigrams ++;
if (uni_bow[word1] + uni_prob[word2] <= prob) {
print;
} else {
removed_bigrams ++;
}
}
#
# trigrams
#
currorder == 3 {
prob = $1;
word1 = $2;
word2 = $3;
word3 = $4;
if (word2 " " word3 in bi_prob) {
backoff_prob = bi_bow[word1 " " word2] + bi_prob[word2 " " word3];
} else {
backoff_prob = bi_bow[word1 " " word2] + \
uni_bow[word2] + uni_prob[word3];
}
total_trigrams ++;
if (backoff_prob <= prob) {
print;
} else {
removed_trigrams ++;
}
}
END {
if (total_bigrams > 0) {
printf "%d out of %d bigrams removed\n", \
removed_bigrams, total_bigrams >> "/dev/stderr";
}
if (total_trigrams > 0) {
printf "%d out of %d trigrams removed\n", \
removed_trigrams, total_trigrams >> "/dev/stderr";
}
}

View File

@@ -0,0 +1,41 @@
#!/usr/local/bin/gawk -f
#
# replace-unk-words --
# replace OOV words with <unk> tag
#
# usage: replace-unk-words vocab=<vocabfile> text > text-with-unk
#
# $Header: /home/srilm/CVS/srilm/utils/src/replace-unk-words.gawk,v 1.1 2013/12/11 08:32:48 stolcke Exp $
#
BEGIN {
unk = "<unk>";
}
NR == 1 {
if (vocab != "") {
nwords = 0;
while ((getline line < vocab) > 0) {
if (split(line, w, " ") > 0) {
is_word[w[1]] = 1;
nwords += 1;
}
}
close(vocab);
print "read " nwords " words" > "/dev/stderr";
}
is_word[unk] = 1;
is_word["<s>"] = 1;
is_word["</s>"] = 1;
}
{
for (i = 1; i <= NF; i ++) {
if (!($i in is_word)) {
$i = unk;
}
}
print;
}
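
Example usage (a sketch; the vocabulary and text file names are placeholders):

    replace-unk-words vocab=lm.vocab train.txt > train.unk.txt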

View File

@@ -0,0 +1,223 @@
#!/usr/local/bin/gawk -f
#
# replace-words-with-classes --
# replace class expansions with class names
#
# usage: replace-words-with-classes classes=<classfile> text > text-with-classes
#        replace-words-with-classes classes=<classfile> have_counts=1 counts \
# > counts-with-classes
#
# optional arguments:
# outfile=<file> output file for class expansion counts (default: none)
# normalize=<0|1> normalize counts to probabilities (default = 1)
# addone=<count> value to add to counts for probability smoothing (1)
#
# $Header: /home/srilm/CVS/srilm/utils/src/replace-words-with-classes.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $
#
function read_classes(file) {
num_class_defs = 0;
delete num_class_expansions;
delete class_expansions;
delete class_expansion_probs;
while ((getline line < file) > 0) {
n = split(line, a);
if (n == 0) continue;
class = a[1];
num_exp = ++ num_class_expansions[class];
if (a[2] ~ /^[-+0-9.][-+0-9e.]*$/) {
prob = a[2];
i = 3;
} else {
prob = "";
i = 2;
}
expansion = a[i];
for (i++; i <= n; i++) {
expansion = expansion " " a[i];
}
class_expansions[class " " num_exp] = expansion;
if (prob != "") {
class_expansion_probs[class " " num_exp] = prob;
}
num_class_defs ++;
}
print "read " num_class_defs " class expansions" >> "/dev/stderr";
# assign default expansion probs
for (class in num_class_expansions) {
num_exp = num_class_expansions[class];
for (i = 1; i <= num_exp; i ++) {
if (class_expansion_probs[class " " i] == "") {
class_expansion_probs[class " " i] = 1/num_exp;
}
}
}
}
##############################################################################
function add_to_prefix_tree(class, expansion, prob) {
nwords = split(expansion, w);
node = 0;
for (k = 1; k <= nwords; k ++) {
next_node = tree[node " " w[k]];
if (!next_node) {
next_node = ++num_nodes;
tree[node " " w[k]] = next_node;
}
node = next_node;
}
if (!(node in node_class)) {
node_class[node] = class;
node_prob[node] = prob;
}
return node;
}
BEGIN {
normalize = 1;
addone = 1;
partial = 0;
}
NR == 1 {
if (classes) {
read_classes(classes);
close(classes);
} else {
print "no classes file specified" >> "/dev/stderr";
}
for (class in num_class_expansions) {
for (i = 1; i <= num_class_expansions[class]; i ++) {
class_expansion_node[class " " i] = \
add_to_prefix_tree(class, class_expansions[class " " i], \
class_expansion_probs[class " " i]);
}
}
}
{
output = "";
next_pos = 1;
# partial option: multiple spaces block multiword replacement
if (partial) {
gsub("[ ][ ]*[ ]", " | ");
}
#
# handle ngram counts by simply leaving the count value alone
# and doing substitution on the ngram itself.
#
if (have_counts) {
max_pos = NF - 1;
} else {
max_pos = NF;
}
while (next_pos <= max_pos) {
class = "";
prob = 0;
num_exp_words = 0;
# search for largest class expansion starting at current position
node = 0;
k = 0;
while (1) {
node = tree[node " " $(next_pos + k)];
if (node) {
if (node in node_class) {
# we have found a complete expansion, record its class
class = node_class[node];
class_node = node;
prob = node_prob[node];
num_exp_words = k + 1;
}
} else {
break;
}
k ++;
}
if (next_pos == 1) {
space = "";
} else {
space = " ";
}
if (!class) {
output = output space $next_pos;
next_pos ++;
} else {
output = output space class;
next_pos += num_exp_words;
node_count[class_node] ++;
class_count[class] ++;
}
}
# partial option: multiple spaces block multiword replacement
if (partial) {
gsub(" [|] ", " ", output);
sub("^[|]", " ", output);
sub("[|]$", " ", output);
}
if (have_counts) {
print output, $NF;
} else {
print output;
}
}
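# Estimate expansion probabilities with add-"addone" (Laplace) smoothing:
# (count + addone) / (total + N * addone), where N is the number of
# expansions of the class.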
function estimate(count, total, N) {
denom = total + N * addone;
if (denom == 0) {
return 0;
} else {
return (count + addone)/denom;
}
}
END {
if (outfile) {
for (class in num_class_expansions) {
for (i = 1; i <= num_class_expansions[class]; i ++) {
nc = node_count[class_expansion_node[class " " i]] + 0;
print class, \
normalize ? \
estimate(nc, class_count[class], \
num_class_expansions[class]) :
nc, \
class_expansions[class " " i] > outfile;
}
}
close(outfile);
}
}

View File

@@ -0,0 +1,70 @@
#!/bin/sh
#
# rescore-acoustic --
# Replace acoustic Nbest scores with a weighted combination of
# old and new acoustic scores
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-acoustic,v 1.8 2015-07-03 03:45:39 stolcke Exp $
#
if [ $# -lt 5 ]; then
echo "usage: $0 old-nbest-dir old-ac-weight new-score-dir1 new-ac-weight1 new-score-dir2 new-ac-weight2 ... new-nbest-dir [max-nbest]" >&2
echo " or $0 old-file-list old-ac-weight new-score-dir1 new-ac-weight1 new-score-dir2 new-ac-weight2 ... new-nbest-dir [max-nbest]" >&2
exit 1
fi
old_nbest=${1}
old_acw=${2}
shift; shift
new_scores=
new_acw=
while [ $# -ge 3 ]
do
new_scores="$new_scores $1"
new_acw="$new_acw $2"
shift; shift
done
new_nbest=${1}
max_nbest=${2-0}
set -e
tmpdir=${TMPDIR-/tmp}
join1="$tmpdir/join1_$$"
join2="$tmpdir/join2_$$"
trap "rm -f $join1 $join2" 0 1 2 15
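# Strategy: turn each score file path into a "sentid path" pair (basename with
# .gz/.score stripped), sort by sentid, and join the old-score table with the
# table from each new score directory, so every line ends up listing all score
# files for one utterance.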
echo "generating sentids ..." >&2
if [ -d $old_nbest ]; then
find $old_nbest/. -follow -type f -print
else
cat $old_nbest
fi | \
sed -e 's,.*,& &,' -e 's,[^ ]*/,,' -e 's,\.gz , ,' -e 's,\.score , ,' | \
sort -k 1,1 > $join1
echo "`wc -l < $join1` utterances" >&2
for d in $new_scores
do
echo "joining $d ..." >&2
find $d/. -follow -type f -print | \
sed -e 's,.*,& &,' -e 's,[^ ]*/,,' -e 's,\.gz , ,' |\
sort -k 1,1 | \
/usr/local/gnu/bin/join $join1 - > $join2
mv $join2 $join1
done
echo "`wc -l < $join1` utterances after joining" >&2
mkdir -p $new_nbest
cat $join1 | \
while read sentid scorefiles
do
echo $sentid >&2
combine-acoustic-scores -v "weights=$old_acw $new_acw" \
-v max_nbest=$max_nbest $scorefiles | \
gzip > $new_nbest/$sentid.score.gz
done

View File

@@ -0,0 +1,466 @@
#!/bin/sh
#
# rescore-decipher --
# generate scores from Decipher(TM) n-best lists
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-decipher,v 1.40 2017/07/20 05:43:59 stolcke Exp $
#
bytelog=0
nodecipherlm=0
multiwords=0
norescore=0
decipher_lmw=8
decipher_wtw=0
lm_only=0
pretty_file=
filter_command=
limit_vocab=0
vocab_aliases=
fast_rescore=
ngram_tool=ngram
ngram_options=
count_oovs=0
rescore_option=-rescore
multichar=_
tmpdir=${TMPDIR-/tmp}
while [ $# -gt 0 ]
do
case "$1" in
-bytelog)
bytelog=1
;;
-nodecipherlm)
nodecipherlm=1
;;
-multiwords)
multiwords=1
mw_option=-multiwords
smw_option=-split-multiwords
;;
-multi-char)
multichar="$2"; shift
;;
-norescore)
norescore=1
;;
-lm-only)
lm_only=1
;;
-count-oovs)
count_oovs=1
rescore_option="-debug 1 -ppl"
;;
-pretty)
pretty_file="$2"; shift
;;
-ngram-tool)
ngram_tool="$2"; shift
;;
-filter)
filter_command="$2"; shift
;;
-limit-vocab)
limit_vocab=1
;;
-vocab-aliases)
vocab_aliases="$2"; shift
;;
-fast)
fast_rescore=1
;;
-*) echo "$0: unknown option $1" >&2
exit 2 ;;
*) break
;;
esac
shift
done
if [ $# -lt 3 ]; then
{
echo "usage: $0 [-bytelog] [-nodecipherlm] [-multiwords] [-multi-char C] [-norescore] [-lm-only] [-count-oovs] [-pretty map] [-ngram-tool pgm] [-filter command] [-limit-vocab] [-vocab-aliases map] [-fast] nbest-file-list score-dir lm-options ..." >&2
echo "where"
echo " -bytelog produces bytelog scaled scores"
echo " -nodecipherlm avoids Decipher LM score computation"
echo " -multiwords expand multiwords into constituent words"
echo " -multi-char C redefine multiword separator character"
echo " -norescore don't rescore LM, just extract scores"
echo " -lm-only output no N-best lists, only LM scores"
echo " -count-oovs output number of OOV and zeroprob words"
echo " -pretty map word mapping file"
echo " -ngram-tool pgm use pgm for LM evaluation"
echo " -filter command text filter to apply to N-best hyps"
echo " -limit-vocab limit LM loading to used vocabulary"
echo " -vocab-aliases map map vocabulary in LM evaluation"
echo " -fast fast rescoring mode, no text filtering allowed"
} >&2
exit 1
fi
filelist="$1"
scoredir="$2"
shift; shift
if [ ! -d $scoredir ]; then
mkdir $scoredir
fi
# when not rescoring need to get decipher lmw and wtw from remaining options
if [ $norescore -gt 0 ]; then
while [ $# -gt 0 ]
do
case "$1" in
-decipher-lmw)
decipher_lmw=$2
shift
;;
-decipher-wtw)
decipher_wtw=$2
shift
;;
*) shift
;;
esac
done
fi
if [ $norescore -eq 0 -a $limit_vocab -gt 0 ]; then
#
# limit LM vocabulary to words found in the nbest lists
#
nbestvocab="$tmpdir/$$nbest.vocab"
trap "rm -f $nbestvocab; exit" 0 1 2 15
# generate nbest vocabulary
if [ -z "$filter_command" ]; then
nbest-lattice -no-rescore -no-reorder \
$mw_option -multi-char "$multichar" \
-nbest-files "$filelist" -write-vocab $nbestvocab
else
cat "$filelist" | xargs gzip -dcf | \
eval "$filter_command" | \
ngram -rescore - -null -no-reorder \
$smw_option -multi-char "$multichar" \
-write-vocab $nbestvocab >/dev/null
fi
# tell ngram to use this vocab
ngram_options="-limit-vocab -vocab $nbestvocab"
fi
if [ $norescore -eq 0 -a -n "$vocab_aliases" ]; then
if [ $limit_vocab -gt 0 ]; then
nbestvocabalias="$tmpdir/$$nbest.vocabalias"
trap "rm -f $nbestvocab $nbestvocabalias; exit" 0 1 2 15
sort -k 2,2 $vocab_aliases | \
join -1 2 -o 1.1,1.2 - $nbestvocab > $nbestvocabalias
# tell ngram to use these vocab-aliases
ngram_options="$ngram_options -vocab-aliases $nbestvocabalias"
else
# tell ngram to use this vocab-alias
ngram_options="-vocab-aliases $vocab_aliases"
fi
fi
if [ -n "$fast_rescore" ]; then
#
# Fast rescoring mode:
# Hand N-best lists directly to ngram. No text filtering is supported
#
if [ -n "$pretty_file" -o -n "$filter_command" -o $lm_only -gt 0 -o $count_oovs -gt 0 ]
then
echo "Text filtering, -lm-only, and -count-oovs not supported with -fast" >&2
exit 2
fi
if [ $nodecipherlm -eq 0 ]; then
echo "Must use -nodecipherlm with -fast" >&2
exit 2
fi
if [ $norescore -gt 0 ]; then
nbest-lattice -no-rescore -no-reorder $mw_option \
-nbest-files "$filelist" \
-write-nbest-dir "$scoredir"
else
if [ "$multiwords" -gt 0 ]; then
mw_option=-split-multiwords
fi
$ngram_tool \
-no-reorder $mw_option -multi-char "$multichar" \
-nbest-files "$filelist" \
-write-nbest-dir "$scoredir" \
-rescore-lmw 1 -rescore-wtw 1 \
$ngram_options "$@"
fi
else # fast_rescore
#
# General rescoring mode:
# Concatenate hyps for all nbest list, record number of hyps for
# each file in the output stream
# Feed to ngram -rescore (using lm-options)
# or using -ppl for counting OOVs
# Parse ngram output into lm scores and deposit into target files
#
escape="***FILE:"
cat $filelist | ( \
while read filename rest; do
case $filename in
# preserve LMstate labels in the file list and pass them to ngram
"<LMstate>") echo $filename $rest
continue ;;
esac
gzip -dcf $filename | \
${GAWK-gawk} '
BEGIN {
filename = "";
numhyps = 0;
nbestformat = 0;
# constants
bytelogscale = 2.30258509299404568402 * 10000.5 / 1024.0;
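	# bytelog is Decipher's fixed-point log scale: ln(10) * 10000.5 / 1024
	# per log10 unit; bytelog2log10() below divides by this constant to
	# convert bytelog scores back to log10.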
pause = "-pau-";
}
function bytelog2log10(x) {
return x / bytelogscale;
}
NR == 1 {
sentid = filename;
sub("^.*/", "", sentid);
sub("\\.gz$", "", sentid);
sub("\\.Z$", "", sentid);
sub("\\.score$", "", sentid);
sub("\\.wv$", "", sentid);
sub("\\.wav$", "", sentid);
sub("\\.wav_cep$", "", sentid);
# read pretty map file
if (pretty_file) {
while ((getline mapline < pretty_file) > 0) {
npretty = split(mapline, pretty_list);
word = pretty_list[1];
pretty_map[word] = "";
for (i = 2; i <= npretty; i ++) {
pretty_map[word] = pretty_map[word] " " pretty_list[i];
}
}
}
print escape, sentid;
}
function pretty_up(start) {
for (i = start; i <= NF; i ++) {
if ($i in pretty_map) {
$i = pretty_map[$i];
}
if (multiwords) gsub(multichar, " ", $i);
}
}
/^NBestList1\.0/ {
nbestformat = 1;
if (nodecipherlm) {
printf "%s: -nodecipherlm ineffective for NBestList1.0\n", filename > "/dev/stderr" ;
}
next;
}
/^NBestList2\.0/ {
nbestformat = 2;
next;
}
{
numhyps ++;
if (nbestformat == 0) {
pretty_up(4);
if (count_oovs) {
# output only the words, add <s> to handle empty hyps
$1 = $2 = $3 = "";
print "<s>", $0;
} else {
print;
}
} else if (nbestformat == 1) {
pretty_up(2);
if (count_oovs) {
# output only the words, add <s> to handle empty hyps
$1 = "";
print "<s>", $0;
} else if (norescore) {
# convert to SRILM format
score = substr($1,2,length($1)-2);
$1 = "";
print bytelog2log10(score), 0, 0, $0;
} else {
# keep Decipher format
print;
}
} else if (nbestformat == 2) {
score = substr($1,2,length($1)-2);
# compute total AC and LM scores
lm_score = 0;
num_words = 0;
num_pauses = 0;
words = "";
prev_end_time = -1;
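	# NBestList2.0 hyps encode each token as a group of 11 fields; as used
	# below, field i is the token string, i+3 and i+5 are its start and end
	# times, and i+7 is its LM score (bytelog).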
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
words = words " " $i;
num_words ++;
if ($i == pause) num_pauses ++;
lm_score += $(i + 7);
prev_end_time = end_time;
}
}
$0 = $1 " " words;
pretty_up(2);
# Compute AC score from total and lm scores. This takes into
# account that the recognizer might sum scores of equivalent hyps
# (e.g., those differing only in pauses or pronunciations) and
# reflect the summing in the total score, but not in the word AC
# scores.
ac_score = score - lm_score;
if (count_oovs) {
# output only the words, add <s> to handle empty hyps
$1 = "";
print "<s>", $0;
} else if (norescore) {
# convert to SRILM nbest format
# NOTES:
# - subtract Decipher WTW (including for pauses!)
# - compute number of words WITHOUT pauses for output
$1 = "";
print bytelog2log10(ac_score), \
bytelog2log10(lm_score/decipher_lmw) - \
num_words * decipher_wtw, \
split(words, dummy) - num_pauses, $0;
} else if (nodecipherlm) {
# output only acoustic score in Decipher format
$1 = "(" ac_score ")";
print;
} else {
# output combined score in Decipher format
print;
}
}
}
END {
if (numhyps == 0) {
print "WARNING: nbest list " filename " is empty" \
> "/dev/stderr" ;
}
}
' filename=$filename escape="$escape" count_oovs=$count_oovs \
nodecipherlm=$nodecipherlm multiwords=$multiwords \
multichar="$multichar" pretty_file="$pretty_file" \
norescore=$norescore decipher_lmw=$decipher_lmw decipher_wtw=$decipher_wtw
done
) | \
if [ $norescore -gt 0 -a -z "$filter_command" ]; then
# no rescoring and no filtering
cat
elif [ $norescore -gt 0 ]; then
# no rescoring, but filter hyps
eval "$filter_command"
elif [ -z "$filter_command" ]; then
# standard rescoring without filtering
$ngram_tool -debug 1 $rescore_option - -rescore-lmw 1 -rescore-wtw 1 \
-escape "$escape " $ngram_options "$@"
else
# rescoring with filtering
eval "$filter_command" | \
$ngram_tool -debug 1 $rescore_option - -rescore-lmw 1 -rescore-wtw 1 \
-escape "$escape " $ngram_options "$@"
fi | \
${GAWK-gawk} -v bytelog=$bytelog '
BEGIN {
currentfile = "";
scoredir = "";
scorefile = "";
numhyps = 0;
bytelogscale = 2.30258509299404568402 * 10000.5 / 1024.0;
}
$1 == escape {
if (currentfile) {
close(scorefile);
}
currentfile = $2;
sub("
$", "", currentfile);
if (!lm_only && !count_oovs) {
# backward compatibility
currentfile = currentfile ".score";
}
scorefile = "gzip > " scoredir "/" currentfile ".gz";
printf "processing hyps for %s\n", currentfile \
> "/dev/stderr" ;
hypno = 0;
next;
}
# parse ngram -ppl output to get OOV (including zeroprobs) count
count_oovs && $6 == "OOVs" {
num_oovs = $5;
next;
}
count_oovs && $2 == "zeroprobs," {
num_oovs += $1;
print num_oovs | scorefile;
next;
}
# process ngram -rescore output
!count_oovs {
if ($2 ~ /NaN/) {
print "WARNING: LM score in nbest list " currentfile " is NaN" \
> "/dev/stderr" ;
$2 = -100000;
}
if (bytelog) {
$1 = $1 * bytelogscale;
$2 = $2 * bytelogscale;
}
if (lm_only) {
print $2 | scorefile;
} else {
print | scorefile;
}
}
END {
if (currentfile) {
close(scorefile);
}
}
' scoredir=$scoredir escape="$escape" bytelog=$bytelog lm_only=$lm_only count_oovs=$count_oovs

View File

@@ -0,0 +1,43 @@
#!/bin/sh
#
# rescore-minimize-wer --
# minimize posterior expected WER in an nbest-list
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-minimize-wer,v 1.7 2013/03/09 07:13:01 stolcke Exp $
#
if [ $# -lt 1 ]; then
echo "usage: $0: score-dir [lmw [wtw [max-nbest]]]" >&2
exit 1
fi
scoredir="$1"
lmweight="${2-8.0}"
wtweight="${3-0.0}"
maxnbest="${4-10}"
find $scoredir -follow -type f \( -name \*.score -o \
-name \*.score.Z -o \
-name \*.score.gz \) \
-print | sort | \
while read file
do
case $file in
*.Z) cat="gzip -dcf"
sentid=`basename $file .score.Z`
;;
*.gz) cat="gzip -dcf"
sentid=`basename $file .score.gz`
;;
*) cat=cat
sentid=`basename $file .score`
;;
esac
${GAWK-gawk} -v sentid="$sentid" 'BEGIN { printf "%s ", sentid }'
$cat $file | \
sed -e 's,-pau-,,g' -e 's,\[[^]]*\],,g' | \
nbest-lattice -wer -debug 1 -rescore - \
-rescore-lmw $lmweight -rescore-wtw $wtweight \
-max-rescore $maxnbest
done

View File

@@ -0,0 +1,77 @@
#!/bin/sh
#
# rescore-nbest --
# output LM scores for nbest lists
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-nbest,v 1.3 1996/03/28 19:12:01 stolcke Exp $
#
if [ $# -lt 3 ]; then
echo "usage: $0: nbest-file-list score-dir lm-options ..." >&2
exit 1
fi
filelist="$1"
scoredir="$2"
shift; shift
#
# STRATEGY:
# Concatenate hyps for all nbest list, record number of hyps for
# each file in the output stream
# Strip hyp ids, !SENT_START, !SENT_END
# Feed to ngram -ppl (using lm-options)
# Parse ngram output into lm scores and deposit into target files
#
escape="***FILE:"
cat $filelist | ( \
while read filename; do
set -e
numhyps=`wc -l < $filename`
echo "$escape `basename $filename .trans`.score $numhyps"
sed \
-e 's/^ *([^ ]*) //' \
-e 's/!SENT_START //' \
-e 's/!SENT_END //' \
$filename
done
) | \
ngram -debug 1 -ppl - -escape "$escape " "$@" | \
gawk '
BEGIN {
currentfile = "";
scoredir = "";
scorefile = "";
numhyps = 0;
M_LN10 = 2.30258509299404568402; # from <math.h>
}
$1 == escape {
if (currentfile) {
close(scorefile);
}
currentfile = $2;
scorefile = scoredir "/" currentfile;
numhyps = $3;
printf "processing %d hyps for %s\n", numhyps, currentfile;
hypno = 0;
next;
}
/logprob=/ {
logprob = $4;
hypno ++;
# rescale LM scores to natural logs
printf "%g\n", logprob * M_LN10 > scorefile;
next;
}
END {
if (currentfile) {
close(scorefile);
}
}
' scoredir=$scoredir escape="$escape"

View File

@@ -0,0 +1,134 @@
#!/bin/sh
#
# rescore-reweight
# reweight nbest-list scores and select top hyps
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-reweight,v 1.20 2013/03/09 07:13:01 stolcke Exp $
#
multiwords=0
multichar=_
while [ $# -gt 0 ]
do
case "$1" in
-multiwords)
multiwords=1
;;
-multi-char)
multichar="$2"
shift
;;
-*) echo "$0: unknown option $1" >&2
exit 2 ;;
*) break
;;
esac
shift
done
if [ $# -lt 1 ]; then
echo "usage: $0 [-multiwords] [-multi-char C] score-dir [lmw [wtw [scoredir weight ...] [max-nbest]]]" >&2
echo " or $0 [-multiwords] [-multi-char C] file-list [lmw [wtw [scoredir weight ...] [max-nbest]]]" >&2
exit 1
fi
scoredir="$1"
shift
lmweight="${1-8.0}"
[ $# -gt 0 ] && shift
wtweight="${1-0.0}"
[ $# -gt 0 ] && shift
extra_scoredirs=
extra_weights=
while [ $# -gt 1 ]; do
extra_scoredirs="$extra_scoredirs $1"
extra_weights="$extra_weights $2"
shift; shift
done
maxnbest="${1-100000}"
# prevent "broken pipe" from $cat below when maxnbest truncates list
trap '' 13
if [ -d $scoredir ]; then
find $scoredir -follow -type f \( -name \*.score -o \
-name \*.score.Z -o \
-name \*.gz \) \
-print | sort
else
cat $scoredir
fi | \
while read file
do
case $file in
*.score.Z) cat="gzip -dcf"
sentid=`basename $file .score.Z`
;;
*.score.gz) cat="gzip -dcf"
sentid=`basename $file .score.gz`
;;
*.score) cat=cat
sentid=`basename $file .score`
;;
*) # use nbest-lattice to convert Decipher nbest format
cat="nbest-lattice -no-rescore -no-reorder -keep-noise -write-nbest - -nbest"
sentid=`basename $file .gz`
;;
esac
if [ -z "$extra_scoredirs" ]; then
$cat $file
else
extra_scores=
for dir in $extra_scoredirs
do
if [ -f $dir/$sentid.gz ]; then
extra_scores="$extra_scores $dir/$sentid.gz"
elif [ -f $dir/$sentid ]; then
extra_scores="$extra_scores $dir/$sentid"
else
echo "$dir/$sentid" is missing >&2
extra_scores="$extra_scores /dev/null"
fi
done
$cat $file | \
combine-acoustic-scores \
-v "weights=1 $extra_weights" \
-v max_nbest=$maxnbest \
- $extra_scores
fi | \
${GAWK-gawk} '
BEGIN {
hypnum = 0;
}
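# SRILM nbest format: field 1 = acoustic score, field 2 = LM score,
# field 3 = word count, remaining fields = the hypothesis words.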
NF >= 3 {
hypnum ++;
if (hypnum > maxnbest) exit 0;
totalscore = $1 + lmweight * $2 + wtweight * $3;
if (!winner || totalscore > maxscore) {
maxscore = totalscore;
winner = $0;
winrank = hypnum;
besthyp = "";
for (i = 4; i <= NF; i++) besthyp = besthyp " " $i;
}
}
END {
# resolve multiwords if requested
if (multiwords) {
gsub(multichar, " ", besthyp);
}
print sentid besthyp;
printf "%s: best hyp is %d\n", sentid, winrank > "/dev/stderr";
}
' sentid="$sentid" lmweight="$lmweight" wtweight="$wtweight" maxnbest="$maxnbest" multiwords=$multiwords multichar="$multichar"
done

View File

@@ -0,0 +1,85 @@
#!/usr/local/bin/gawk -f
#
# reverse-lm --
# reverse N-grams in a backoff LM
#
# usage: reverse-lm lm-file > rev-lm-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-lm.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
start_tag = "<s>";
end_tag = "</s>";
renorm_command = "ngram -debug 1 -order 2 -lm - -renorm -write-lm -";
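	# all output is piped through "ngram -renorm" so that backoff weights
	# are recomputed for the reversed model; the dummy weights written
	# below are only placeholders.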
}
NF==0 {
print | renorm_command;
next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
if (order > 2) {
print "can handle bigram LMs only" >> "/dev/stderr";
exit(2);
}
print | renorm_command;
next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
print | renorm_command;
next;
}
/^\\/ {
print | renorm_command;
next;
}
currorder == 1 {
# unigrams are copied unchanged
# store probs for later use
prob = $1;
word = $2;
if (word == start_tag) {
; # get <s> unigram prob from </s>
} else if (word == end_tag) {
uniprob[start_tag] = uniprob[end_tag] = prob;
} else {
uniprob[word] = prob;
}
# add dummy backoff weight
$3 = "0";
print | renorm_command;
next;
}
function map_tags(w) {
if (w == start_tag) {
return end_tag;
} else if (w == end_tag) {
return start_tag;
} else {
return w;
}
}
currorder == 2 {
# bigrams are reversed and new probabilities are assigned
prob = $1;
w1 = map_tags($2);
w2 = map_tags($3);
# p_rev(w1|w2) = p(w1) p(w2|w1) / p(w2)
new_prob = uniprob[w1] + prob - uniprob[w2];
if (new_prob > 0) {
print "warning: p(" w1 "|" w2 ") > 0" >> "/dev/stderr";
}
print new_prob "\t" w2 " " w1 | renorm_command;
next;
}

View File

@@ -0,0 +1,28 @@
#!/usr/local/bin/gawk -f
#
# reverse-ngram-counts --
# Reverse the word order in N-gram count files
#
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-ngram-counts.gawk,v 1.2 2017/07/31 18:18:50 stolcke Exp $
#
BEGIN {
start_tag = "<s>";
end_tag = "</s>";
}
{
i = 1;
j = NF - 1;
while (i < j) {
h = $i;
$i = $j;
$j = h;
i ++; j--;
}
# swap <s> and </s> tags
for (i = 1; i < NF; i ++) {
if ($i == end_tag) $i = start_tag;
else if ($i == start_tag) $i = end_tag;
}
print;
}

View File

@@ -0,0 +1,32 @@
#!/usr/local/bin/gawk -f
#
# reverse-text --
# Reverse the word order in a text file
#
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-text.gawk,v 1.1 2003/01/01 18:35:23 stolcke Exp $
#
BEGIN {
start_tag = "<s>";
end_tag = "</s>";
}
{
if ($1 == start_tag) {
i = 2;
} else {
i = 1;
}
if ($NF == end_tag) {
j = NF - 1;
} else {
j = NF;
}
while (i < j) {
h = $i;
$i = $j;
$j = h;
i ++; j--;
}
print;
}

View File

@@ -0,0 +1,176 @@
#!/bin/sh
#
# rexport --
# retrying export with customs, via gnumake
#
# $Header: /home/srilm/CVS/srilm/utils/src/rexport.gnumake,v 1.2 2011/07/21 19:48:19 stolcke Exp $
#
usage() {
echo "usage: $0 [-m] [-J numjobs] [-delay D] [-check-exec] [-f] [-debug] [-same] [-exclusive] [-exit-on-error] [-uselocal] [-attr value] ... command [args ...]" >&2
}
# allow as many file descriptors as possible for pmake
# (this command may fail in old versions of sh -- we ignore that)
ulimit -n `ulimit -H -n 2>/dev/null` >/dev/null 2>&1
set -e
jobs=1
makemode=0
delay=
check_exec=0
exit_on_error=0
#
# parse options
#
attributes=
while [ $# -gt 0 ]; do
case "$1" in
-m) makemode=1
shift ;;
-same) attributes="$attributes SAME"
shift ;;
-exclusive)
attributes="$attributes EXCLUSIVE"
shift ;;
-uselocal)
attributes="$attributes USELOCAL"
shift ;;
-attr) attributes="$attributes $2"
shift; shift;;
-debug) debug=1
shift ;;
-f) readfiles=1;
shift ;;
-J) jobs="$2"
shift; shift ;;
-delay) delay="$2"
shift; shift ;;
-check-exec)
check_exec=1
shift ;;
-exit-on-error)
exit_on_error=1
shift ;;
-*) usage
exit 2 ;;
*)
break ;;
esac
done
#
# parse command
#
# find tmp file that doesn't exist yet
for suffix in a b c d e f g h i j k l m n o p q r s t u v x y z
do
mkfile=/tmp/export$$$suffix
if [ ! -f $mkfile ]; then
break
fi
done
trap "rm -f $mkfile; exit 1" 1 2 15
#
# create makefile
#
if [ "$#" -eq 0 -o "$readfiles" ]; then
# read commands from files or stdin
cat "$@"
else
# use what's on the command line
echo "$@"
fi | \
gawk '
BEGIN {
ld_lib_path_var = "LD_LIBRARY_PATH";
}
NR == 1 {
# always use /bin/sh for portability across platforms
print "SHELL=/bin/sh"
print ".cleanup: ; @/bin/rm -f " mkfile
jobnum = 0;
}
NF > 0 {
jobnum ++;
job = ".job" jobnum;
alljobs = alljobs job " ";
# make sure shell variable expansion is preserved
gsub("\\$", "$$");
delay = delay + 0;
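	# -check-exec: wrap the command so it waits (polling every 5 seconds)
	# until the first absolute-path word in the command exists and is
	# executable before running it.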
if (check_exec) {
exec_file = "";
for (i = 1; i <= NF; i ++) {
if ( $i ~ "^/") {
exec_file = $i;
break;
}
}
if (exec_file) {
sub("[;&|].*", "", exec_file);
$0 = "while [ ! -x " exec_file " ]; do sleep 5; done; " $0
}
}
if (ld_lib_path_var in ENVIRON) {
$0 = ld_lib_path_var "=" ENVIRON[ld_lib_path_var] "; export " ld_lib_path_var "; " $0
}
if (njobs > 1) {
if (delay > 0 && jobnum > 1) {
prev_delay_target = delay_target;
delay_target = "delay" jobnum;
print delay_target ": " prev_delay_target \
"; @sleep " delay;
} else {
delay_target = "";
}
print job ": " delay_target "; " $0;
} else {
print job ": ; @" $0;
}
if (makemode) {
print "\t@touch " job;
}
}
END {
print "all: " alljobs;
print alljobs ": .cleanup";
if (jobnum == 0) {
print "warning: empty command list" > "/dev/stderr";
}
}
' makemode=$makemode attributes="$attributes" mkfile="$mkfile" \
njobs="$jobs" delay="$delay" check_exec=$check_exec \
exit_on_error=$exit_on_error > $mkfile
if [ "$debug" ]; then
cat $mkfile
rm -f $mkfile
exit
fi
# avoid illegal values when make is invoked from other makes
MAKEFLAGS=
MFLAGS=
export MAKEFLAGS MFLAGS
if [ $exit_on_error = 0 ]; then
ignoreflag=-k
fi
exec make -j $jobs $ignoreflag -f $mkfile all

View File

@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# rover-control-tying --
# extract tying information from rover-control file for use with
# compute-best-rover-mix tying=...
#
BEGIN {
bin = 0;
}
/^##/ || /^[ ]*$/ {
# skip comment or empty line
next;
}
$3 == "+" {
next;
}
{
if ($4 == "") $4 = 1;
if ($4 == "=") {
output = output " " bin;
} else {
output = output " " ++bin;
}
}
END {
sub("^ ", "", output);
print output;
}

View File

@@ -0,0 +1,65 @@
#!/usr/local/bin/gawk -f
#
# rover-control-weights --
# retrieve or change weights in rover-control file
#
# usage:
# retrieving
# rover-control-weights rover-control
# changing:
# rover-control-weights weights="..." rover-control > new-rover-control
#
# $Header: /home/srilm/CVS/srilm/utils/src/rover-control-weights.gawk,v 1.3 2017/08/16 06:34:16 stolcke Exp $
#
NR == 1 {
if (weights) {
nweights = split(weights, w);
}
output_weights = "";
}
/^##/ || /^[ ]*$/ {
# pass through comment or empty line
print;
next;
}
$3 == "+" {
if (weights) {
print;
}
next;
}
{
# dir lmw wtw weight max_nbest scale
if (weights) {
# fill in missing parameter values
if (NF < 2) $2 = 8;
if (NF < 3) $3 = 0;
if (++ sysno <= nweights) {
if ($4 == "=" && w[sysno] == w[sysno-1]) {
# preserve weight tying if new weights are compatible
;
} else {
$4 = w[sysno];
}
} else {
$4 = 1;
}
print;
} else {
if (NF < 4) $4 = 1;
output_weights = output_weights " " $4;
}
}
END {
if (!weights) {
sub("^ ", "", output_weights);
print output_weights;
}
}

View File

@@ -0,0 +1,267 @@
#!/bin/sh
#
# search-rover-combo --
# search for best rover combination from a list of systems
#
# $Header: /home/srilm/CVS/srilm/utils/src/search-rover-combo,v 1.14 2016-12-10 18:20:33 stolcke Exp $
#
scriptdir=`dirname $0`
score_script=$scriptdir/score-hyps
datadir=SEARCH-DATA
weights="1"
smooth_weight=
sentids=-
njobs=1
refs=
# collect options
while [ $# -gt 0 ]; do
case "$1" in
-rover) shift
run_rover=1
break ;;
-rover-optimize) shift
run_rover_optimize=1
break ;;
-scorer) score_script="$2";
shift; shift ;;
-weights) weights="$2";
shift; shift ;;
-smooth-weight)
smooth_weight="$2";
shift; shift ;;
-smooth-control)
smooth_control="$2";
shift; shift ;;
-datadir) datadir="$2";
shift; shift ;;
-sentids) sentids="$2";
shift; shift ;;
-refs) refs="$2"
shift; shift ;;
-J) njobs=$2
shift; shift ;;
-*) echo "usage: $0 [-scorer SCRIPT] [-weights=\"W1 W2 ...\" | -refs REFS] [-smooth-weight S] [-datadir DIR] [-sentids LIST] LIST-OF-CONTROL-FILES" >&2
exit 2 ;;
*) break ;;
esac
done
# see if this is a recursive evaluation to run a single nbest-rover
if [ -n "$run_rover" ]; then
# sentids control-file hyps-out
nbest-rover $1 $2 > $3
exit
elif [ -n "$run_rover_optimize" ]; then
# sentids control-file hyps-out refs
nbest-rover $1 $2 /dev/null > $3-0 2>&1 \
-refs $4 -write-ref-posteriors $3.ref-posteriors
rm $3-0
tying=`rover-control-tying $2`
compute-best-rover-mix tying="$tying" $3.ref-posteriors > $3.optimize 2>&1
weights=`${GAWK-gawk} '/best lambda/ { sub(".*[(]", "", $0); sub("[)]", "", $0); print }' $3.optimize `
rover-control-weights weights="$weights" $2 > $2.optimized1
if [ -n "$smooth_weight" -a -n "$smooth_control" ]; then
combine-rover-controls keeppaths=1 lambda=$smooth_weight $smooth_control $2.optimized1 > $2.optimized
else
mv $2.optimized1 $2.optimized
fi
nbest-rover $1 $2.optimized > $3
exit
fi
rexport=${REXPORT-rexport.gnumake -exit-on-error -J $njobs -f}
input_list=${1-SYSTEM-LIST}
# backward compatibility for 2nd argument
score_script=${2-$score_script}
# backward compatibility for 3rd argument
datadir=${3-$datadir}
set -e
mkdir -p $datadir
#
# Step 1: compute errors for individual systems
#
system_errors=$datadir/system-errors
cmdlist=$datadir/score.rexports
tmpctrl=$datadir/tmp.control
tmphyps=$datadir/tmp.hyps
tmpscore=$datadir/tmp.score
sort $input_list > $datadir/sorted_inputs
iter=0
iterdir=$datadir/$iter
mkdir -p $iterdir
system_errors=$iterdir/system_errors
if [ ! -s $system_errors ]; then
count=1
> $cmdlist
cat $datadir/sorted_inputs | \
while read roverctrl
do
# rewrite rover control file to adjust directory paths
combine-rover-controls $roverctrl > $tmpctrl.$count
echo "$0 -rover $sentids $tmpctrl.$count $tmphyps.$count; \
echo $roverctrl \`$score_script $tmphyps.$count\` > $tmpscore.$count" >> $cmdlist
count=`expr $count + 1`
done
# run the scoring jobs
if [ $njobs -lt 2 ]; then
sh -ex $cmdlist >$cmdlist.log 2>&1
else
$rexport $cmdlist >$cmdlist.log 2>&1
fi
sort +0 -1 $tmpscore.* > $system_errors
rm -f $tmpctrl.* $tmphyps.* $tmpscore.*
fi # system_errors exists
best_system=`sort +1n -2 $system_errors | ${GAWK-gawk} '{ print $1; exit }' `
best_error=`sort +1n -2 $system_errors | ${GAWK-gawk} '{ print $2; exit }' `
echo "FIRST SYSTEM" >&2
echo $best_system >&2
echo "ERROR $best_error" >&2
echo "$best_system 1" > $iterdir/combo
join -v 1 $datadir/sorted_inputs $iterdir/combo > $iterdir/unused
cat $best_system > $iterdir/rover.control
tryall=yes
# if weight estimation is used, we always add the new system at a fixed lower weight
# than the sum of prior systems
if [ -n "$refs" ]; then
weights=0.5
fi
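# Greedy forward search: starting from the single best system, each iteration
# tries adding every unused system (at each candidate weight) to the current
# rover combination and keeps the addition that reduces the error the most;
# if nothing improves, previously discarded systems are put back in the
# running once ("EXPANDING SEARCH") before the search stops.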
while [ -s $iterdir/unused ]
do
newiter=`expr $iter + 1`
newiterdir=$datadir/$newiter
mkdir -p $newiterdir
echo "ITER $newiter" >&2
system_errors=$newiterdir/system_errors
if [ ! -s $system_errors ]; then
for weight in $weights
do
count=1
> $cmdlist
cat $iterdir/unused | \
while read roverctrl
do
combine-rover-controls keeppaths=1 lambda="1 $weight" $iterdir/rover.control $roverctrl > $tmpctrl.$count
if [ -n "$refs" ]; then
# evaluate rover control file with weight optimization
if [ -n "$smooth_weight" ]; then
smooth="-smooth-weight $smooth_weight -smooth-control $iterdir/rover.control"
fi
echo "$0 $smooth -rover-optimize $sentids $tmpctrl.$count $tmphyps.$count $refs; \
echo $roverctrl $weight \`$score_script $tmphyps.$count\` $tmpctrl.$count.optimized > $tmpscore.$count" >> $cmdlist
else
# evaluate rover control file without weight optimization
echo "$0 -rover $sentids $tmpctrl.$count $tmphyps.$count; \
echo $roverctrl $weight \`$score_script $tmphyps.$count\` $tmpctrl.$count > $tmpscore.$count" >> $cmdlist
fi
count=`expr $count + 1`
done
# run the scoring jobs
if [ $njobs -lt 2 ]; then
sh -ex $cmdlist >$cmdlist.log 2>&1
else
$rexport $cmdlist >$cmdlist.log 2>&1
fi
sort +0 -1 $tmpscore.* > $system_errors
${GAWK-gawk} -v old_error=$best_error '$3 < old_error' $system_errors > $system_errors.improved
if [ -s $system_errors.improved ]; then
# we found at least one improvement; stop trying weights
break;
fi
done
else
# restart search at this iteration
${GAWK-gawk} -v old_error=$best_error '$3 < old_error' $system_errors > $system_errors.improved
fi
if [ -s $system_errors.improved ]; then
best_system=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $1; exit }' `
best_weight=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $2; exit }' `
best_error=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $3; exit }' `
best_control=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $4; exit }' `
echo "NEXT SYSTEM" >&2
echo "$best_system $best_weight" >&2
echo "ERROR $best_error" >&2
if [ ! -s $newiterdir/rover.control ]; then
cat $best_control > $newiterdir/rover.control
fi
{ cat $iterdir/combo; echo "$best_system $best_weight"; } | sort +0 -1 > $newiterdir/combo
${GAWK-gawk} '{ print $1 }' $system_errors.improved | \
join -v 1 - $newiterdir/combo > $newiterdir/unused
tryall=yes
else
cat $iterdir/combo > $newiterdir/combo
cat $iterdir/rover.control > $newiterdir/rover.control
fi
rm -f $tmpctrl.* $tmphyps.* $tmpscore.*
if [ ! -s $newiterdir/unused -a "$tryall" ]; then
# no improvement -- add all previously discarded systems back into the running
echo "EXPANDING SEARCH" >&2
if [ ! -f $newiterdir/combo ]; then
# try extending the same combo again in next iteration
cat $iterdir/combo > $newiterdir/combo
cat $iterdir/rover.control > $newiterdir/rover.control
fi
join -v 1 $datadir/sorted_inputs $newiterdir/combo > $newiterdir/unused
# do this only once until we can add a new system
tryall=
fi
iter=$newiter
iterdir=$newiterdir
done
echo "BEST COMBO" >&2
cat $iterdir/combo >&2
echo "ERROR $best_error" >&2
cat $iterdir/rover.control

View File

@@ -0,0 +1,551 @@
#!/usr/bin/perl
#
# Usage: select-vocab [-quiet] -heldout file f1 f2 ... fn
#
# Selects a vocabulary from the union of the vocabularies of f1
# through fn that maximizes the likelihood of the heldout file. f1
# through fn can either be text files, count files or ARPA-style
# back-off language models. If they are text files, further,
# each line in them can optionally be prefixed by a sentence id, which
# will be stripped if the file has the .sentid extension.
#
# Note: This implementation corrects an error in the paper [1]. The
# EM procedure specification in [1] describes corpus level interpolation.
# But we use word-level interpolation.
#
# Authors: Anand Venkataraman and Wen Wang
# STAR Lab, SRI International, Menlo Park, CA 94025, USA.
#
# $Header: /home/srilm/CVS/srilm/utils/src/select-vocab.pl,v 1.7 2013/04/05 16:50:56 stolcke Exp $
#
# Globals
my $Quiet = 0; # Quiet or Verbose?
my $Gzip = 0; # Do we have Gzip?
MAIN: {
my $heldOut = ""; # Filename of the heldout corpus
my $maxIter = 500; # Perform a maximum of this many EM iters
my $precision = 1e-5; # Stop EM iterations when log likelihood changes less than this much
my $scale = 1e6; # Scale final output counts by this much
while ($arg = shift(@ARGV)) {
if ($arg =~ /^-h(elp)?$/) {
usage();
} elsif ($arg =~ /^-held(-)?(out)?$/) {
$heldOut = shift(@ARGV);
} elsif ($arg =~ /^-scale(-)?(counts)?$/) {
$scale = shift(@ARGV);
} elsif ($arg =~ /^-q(uiet)?$/) {
$Quiet = 1;
} elsif ($arg =~ /^-/) {
print STDERR "Unknown option: $arg\n";
usage();
} else {
unshift(@ARGV, $arg);
last;
}
}
die "$0: I need a held out corpus (-heldout) to maximize likelihood.\n" if ($heldOut eq "");
die "$0: I need at least two corpora to combine vocabulary counts.\n" if ($#ARGV < 1);
# Determine whether gzip exists in the path
#
if (system("sh -c 'gzip -help' >/dev/null 2>&1") == 0) {
message("I found gzip in your path. So I'll support compressed input.\n");
$Gzip=1;
} else {
message("I didn't find gzip in your path. So I won't support compressed input.\n");
$Gzip=0;
}
# Make held-out counts and calculate total number of tokens.
#
my $heldOut_counts_ref = make_raw_counts($heldOut);
my $numWords = 0;
foreach my $word (keys %{$heldOut_counts_ref}) {
$numWords += $heldOut_counts_ref->{$word};
}
die "$0: The held-out corpus must not be empty.\n" if ($numWords == 0);
# The grand vocab is a union of all possible words, including in the Heldout set.
#
my $vocab = make_full_vocab($heldOut, @ARGV);
# Create log distributions for each of the (n > 1) corpora. The counts
# will all use a common vocabulary that is the union of the individual
# vocabularies. Use Witten-Bell discounting to handle zero-frequency
# items in the normalization process.
#
for (my $n = 0; $n <= $#ARGV; $n++) {
$lambda[$n] = 1/($#ARGV+1);
$logprobs_refs[$n] = estimate_logprobs($ARGV[$n], $vocab);
}
message("Iter 0: lambdas = (@lambda)\n");
# Now perform EM. Iterate to increase the likelihood of the heldout set.
# Procedure halts when the likelihood changes by less than $precision
# after an iteration. See Eqns. (3)-(6) of Venkataraman & Wang, 2003.
#
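# Word-level EM update (implemented in the per-word loop below):
#   posterior_n(w) = lambda_n * p_n(w) / sum_m lambda_m * p_m(w)
#   lambda'_n      = sum_w c_heldout(w) * posterior_n(w) / numWords
# computed in log space via logsum() to avoid underflow.
#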
$done = 0;
$iter = 0;
while (!$done && $iter < $maxIter) {
$done = 1;
$iter++;
my $loglike = 0;
@post_totals = ();
# Calculate log lambdas.
#
for (my $n = 0; $n <= $#ARGV; $n++) {
$log_lambda[$n] = log($lambda[$n]);
}
# Estimate lambdas per word and average over all words.
#
foreach my $word (keys %{$heldOut_counts_ref}) {
undef $log_numer_sum;
for (my $n = 0; $n <= $#ARGV; $n++) {
$log_numer[$n] = $log_lambda[$n] + $logprobs_refs[$n]->{$word};
$log_numer_sum = logsum($log_numer_sum, $log_numer[$n]);
}
$loglike += $log_numer_sum * $heldOut_counts_ref->{$word};
for (my $n = 0; $n <= $#ARGV; $n++) {
$post_totals[$n] += exp($log_numer[$n] - $log_numer_sum) * $heldOut_counts_ref->{$word};
}
}
for (my $n = 0; $n <= $#ARGV; $n++) {
$lambda_prime[$n] = $post_totals[$n]/$numWords;
$delta[$n] = abs($lambda_prime[$n] - $lambda[$n]);
$done = 0 if ($delta[$n] > $precision);
}
@lambda = @lambda_prime;
next if $Quiet;
for (my $n = 0; $n <= $#lambda_prime; $n++) {
$lambda_trunc[$n] = sprintf("%0.6f", $lambda[$n]);
}
my $ppl_trunc = sprintf("%.4f", exp(-$loglike/$numWords));
my $loglike_trunc = sprintf("%.4f", $loglike);
message("Iter $iter: lambdas = (@lambda_trunc) log P(held-out) = $loglike_trunc PPL = $ppl_trunc\n");
}
# Compute the combined counts.
#
message("Combining counts.\n");
undef %counts;
foreach my $word (keys %{$vocab}) {
for (my $n = 0; $n <= $#ARGV; $n++) {
$counts{$word} += $lambda[$n] * exp($logprobs_refs[$n]->{$word});
}
}
# Print out the final vocab with the combined counts scaled by $scale.
#
foreach my $word (keys %counts) {
my $score = $counts{$word} * $scale;
print "$word\t $score\n";
}
exit(0);
}
#----------------------------------------------------------------------
# Return a ref to a hash of normalized counts. Use the given vocabulary
# and Witten-Bell (1991) smoothing to ensure non-zero probabilities.
#
sub estimate_logprobs {
my($f, $voc_ref) = @_;
message("Estimating logprobs for $f. ");
my $counts_ref = make_raw_counts($f);
my $sumcounts = 0;
foreach my $word (keys %{$counts_ref}) {
$sumcounts += $counts_ref->{$word};
}
# Compute the number of "novel" words. i.e. words in vocab, but
# not in counts.
#
my $vocabsize = scalar keys %{$voc_ref};
my $nwords = scalar keys %{$counts_ref};
my $num_novel = $vocabsize - $nwords;
message("It has all but $num_novel vocabulary words.\n");
# If there are no novel words, just normalize and return;
#
if (!$num_novel) {
foreach my $word (keys %{$counts_ref}) {
$counts_ref->{$word} = log($counts_ref->{$word}) - log($sumcounts);
}
return $counts_ref;
}
# Create keys for novel words.
#
foreach my $word (keys %{$voc_ref}) {
$counts_ref->{$word} += 0;
}
# If the sum of the counts is less than one, we probably got them from a
# language model that already smoothed the unigram counts. So we use the left over
# mass for novel words. Otherwise, if the sum is equal to 1, we rescale the
# probabilities by 0.9 (until a better way can be found), and use the remaining
# mass to distribute. If the counts are > 1, then we perform smoothing ourselves.
#
if ($sumcounts < 1) {
my $novel_mass = 1-$sumcounts;
message("\tSum of counts in $f is only $sumcounts\n");
message("\tWill distribute probabilty mass of $novel_mass over novel words\n");
my $novel_logprob = log(1-$sumcounts) - log($num_novel);
foreach my $word (keys %{$counts_ref}) {
if ($counts_ref->{$word}) {
$counts_ref->{$word} = log($counts_ref->{$word});
} else {
$counts_ref->{$word} = $novel_logprob;
}
}
return $counts_ref;
}
if ($sumcounts == 1) {
message("\tSum of counts in $f is exactly 1\n");
message("\tWill scale them by 0.9 and use 0.1 for novel words.\n");
my $novel_logprob = log(0.1/$num_novel);
foreach my $word (keys %{$counts_ref}) {
if ($counts_ref->{$word}) {
$counts_ref->{$word} = log($counts_ref->{$word} * 0.9);
} else {
$counts_ref->{$word} = $novel_logprob;
}
}
return $counts_ref;
}
# Normalize and smooth. Note that in calculating the probability of novel words,
# the Witten-Bell estimate for the novel event is $nwords/($sum_counts+$nwords).
# This mass is shared equally by each of the novel words and hence $num_novel in
# the denominator.
#
foreach my $word (keys %{$counts_ref}) {
if ($counts_ref->{$word}) {
$counts_ref->{$word} = log($counts_ref->{$word}/($sumcounts + $nwords));
} else {
$counts_ref->{$word} = log($nwords) - log($sumcounts + $nwords) - log($num_novel);
}
}
return $counts_ref;
}
#---------------------------------------------------------------------------
# The following subroutines construct the vocabulary from various kinds
# of input files.
#
sub make_full_vocab {
my @files = @_;
my %voc;
foreach my $f (@files) {
$ftype = getftype($f);
if ($ftype eq "text") {
message("Adding words from text file $f into vocabulary.\n");
add_vocab_from_text(\%voc, $f);
} elsif ($ftype eq "sentid") {
message("Adding words from sentID file $f into vocabulary.\n");
add_vocab_from_sentid(\%voc, $f);
} elsif ($ftype eq "counts") {
message("Adding words from counts file $f into vocabulary.\n");
add_vocab_from_counts(\%voc, $f);
} elsif ($ftype eq "arpa-lm") {
message("Adding words from ARPA-style LM file $f into vocabulary.\n");
add_vocab_from_lm(\%voc, $f);
} else {
die "I don't know the file type for $f. Giving up.\n";
}
}
return \%voc;
}
sub add_vocab_from_text {
my($voc_ref, $f) = @_;
my $in = zopen($f);
while (my $line = <$in>) {
my @words = split(/\s+/, $line);
foreach my $word (@words) {
$voc_ref->{$word} = 0;
}
}
close($in);
}
# Same as above, but gets rid of sentid (first word on each line)
#
sub add_vocab_from_sentid {
my($voc_ref, $f) = @_;
my $in = zopen($f);
while (my $line = <$in>) {
my @words = split(/\s+/, $line);
shift(@words); # Toss sentid
foreach my $word (@words) {
$voc_ref->{$word} = 0;
}
}
close($in);
}
# Same as above, but only uses the first word of each line. Each line
# in a count file will have two fields -- word count
#
sub add_vocab_from_counts {
my($voc_ref, $f) = @_;
my $in = zopen($f);
while (my $line = <$in>) {
my @fields = split(/\s+/, $line);
next if $line =~ /^\s*$/ || $#fields > 1; # Ignore non-unigram counts
next if $fields[0] =~ /<.*>/; # Skip pseudo words.
$voc_ref->{$fields[0]} = 0;
}
close($in);
}
# Same as above, but only takes probabilities from the unigram
# portion of the arpa-format lm.
#
sub add_vocab_from_lm {
my($voc_ref, $f) = @_;
my $in = zopen($f);
# Locate unigram section
while (my $line = <$in>) {
last if $line =~ /^\\1-grams:/;
}
# Read unigrams into vocab
while (my $line = <$in>) {
last if $line =~ /^\\2-grams:/;
my ($logprob, $word, @rest) = split(/\s+/, $line);
next if $word =~ /(^\s*$)|(<.*>)/; # Skip pseudo words.
$voc_ref->{$word} = 0;
}
close($in);
}
#----------------------------------------------------------------------
# The following subroutines are very similar to the ones above.
# They return a ref to a hash of unnormalized counts from various kinds
# of input files.
#
sub make_raw_counts {
my($f) = @_;
$ftype = getftype($f);
if ($ftype eq "text") {
return make_raw_counts_from_text($f);
} elsif ($ftype eq "sentid") {
return make_raw_counts_from_sentid($f);
} elsif ($ftype eq "counts") {
return make_raw_counts_from_counts($f);
} elsif ($ftype eq "arpa-lm") {
return make_raw_counts_from_lm($f);
} else {
die "I don't know the file type for $f. Giving up.\n";
}
}
sub make_raw_counts_from_text {
my($f) = @_;
my %counts;
my $in = zopen($f);
while (my $line = <$in>) {
my @words = split(/\s+/, $line);
foreach my $word (@words) {
$counts{$word}++;
}
}
close($in);
return \%counts;
}
sub make_raw_counts_from_sentid {
my($f) = @_;
my %counts;
my $in = zopen($f);
while (my $line = <$in>) {
my @words = split(/\s+/, $line);
shift (@words); # Toss sentid
foreach my $word (@words) {
$counts{$word}++;
}
}
close($in);
return \%counts;
}
sub make_raw_counts_from_counts {
my($f) = @_;
my %counts;
my $in = zopen($f);
while (my $line = <$in>) {
my @fields = split(/\s+/, $line);
next if $line =~ /^\s*$/ || $#fields > 1; # Ignore non-unigram counts.
next if $fields[0] =~ /<.*>/; # Skip pseudo words.
$counts{$fields[0]} += $fields[1];
}
close($in);
return \%counts;
}
# Well, the counts from the lm aren't going to be raw. We just have to
# settle for the normalized counts.
#
sub make_raw_counts_from_lm {
my($f) = @_;
my %counts;
my $in = zopen($f);
# Locate unigram section
while (my $line = <$in>) {
last if $line =~ /^\\1-grams:/;
}
# Read in unigram counts
while (my $line = <$in>) {
last if $line =~ /^\\2-grams:/;
my ($logprob, $word) = split(/\s+/, $line);
next if $word =~ /(^\s*$)|(<.*>)/; # Skip pseudo words.
$counts{$word} += 10**$logprob;
}
close($in);
return \%counts;
}
#---------------------------------------------------------------------------
sub getftype {
my($f) = @_;
# First check if it is a sentid file. If necessary insert further checks
# by looking into the file.
#
return "sentid" if ($f =~ /\.sentid(\.gz|\.Z)?$/);
# Extract the first five lines from the file to make our decision.
#
my $in = zopen($f);
for (my $i = 0; $i < 5; $i++) {
$lines[$i] = <$in> || last;
}
close($in);
# Is it a count file? Assume it is and try to falsify from the
# first 5 lines. Format should be -- word count \n
#
my $countfile = 1;
for (my $i = 0; $i < 5; $i++) {
my @words = split(/\s+/, $lines[$i]);
if ($words[$#words] !~ /\d+/) {
$countfile = 0;
last;
}
}
return "counts" if ($countfile == 1);
# Is it an arpa-style language model?
#
my $s = join(' ', @lines);
return "arpa-lm" if ($s =~ /\s*\\data\\\s*ngram\s+1\s*=/);
# Otherwise, assume it is a text file.
#
return "text";
}
# Given log(x) and log(y), this function returns log(x+y).
#
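# Implemented as max(x,y) + log(1 + exp(min - max)) so the exp() never overflows.
#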
sub logsum {
my($x,$y) = @_;
my $z;
if (!defined($x)) {
$z = $y;
} elsif (!defined($y)) {
$z = $x;
} else {
$z = ($x < $y)? logsum($y,$x) : $x + log(1+exp($y-$x));
}
return $z;
}
sub message {
my($msg) = @_;
return if ($Quiet);
print STDERR "$msg";
}
# Opens a possibly compressed file. Only uncomment the gzip line
# if gzip is available. Otherwise, compressed files aren't supported.
#
sub zopen {
my($f) = @_;
local *IN;
die "$f is not a file.\n" if ! -f $f;
if (!$Gzip) {
open(IN, $f) || die "$f: $!\n";
} else {
open(IN, "gzip -dfc $f |") || die "gzip -dfc $f: $!\n";
}
return *IN;
}
sub usage {
print STDERR <<" .;";
Usage:
$0 [-quiet] [-scale n] -heldout corp_h corp1 corp2 ...
Estimate weighted and combined counts for the words in the vocabulary.
The weights maximize the likelihood of the heldout corpus, corp_h, under a
mixture of Witten-Bell smoothed unigram language models estimated from corp1
through corpn.
-quiet stops debug style messages while running.
-scale n causes final combined counts to be scaled by n.
.;
exit 1;
}
#---------------------------------------------------------------------------------
# References.
#
# 1. Venkataraman, A. and W. Wang, (2003). "Techniques for effective vocabulary
# selection", in Proceedings of Eurospeech'03, Geneva, 2003.
#
# 2. Witten, I. H. and T. C. Bell, (1991). "The zero-frequency problem:
# Estimating the probabilities of novel events in adaptive text compression",
# IEEE Trans. IT, 37, pp. 1085-1091.

View File

@@ -0,0 +1,145 @@
#!/usr/local/bin/gawk -f
#
# sentid-to-ctm --
# Format a sentid transcript file into CTM format, faking time marks
# by spacing words evenly across the duration of the segment
#
# Note: this script makes assumptions about the structure of sentence
# ID, specifically, how they encode speakers and timemarks.
#
# $Header: /home/srilm/CVS/srilm/utils/src/sentid-to-ctm.gawk,v 1.11 2019/02/09 07:31:37 stolcke Exp $
#
BEGIN {
# time to leave at edges of segments
delta = 0.07;
pause = "-pau-";
reject = "@reject@";
sort_cmd = "sort -b -k 1,1 -k 2,2 -k 3,3n";
}
# read confidences and/or segment information if given
NR == 1 {
if (confidences) {
while ((getline line < confidences) > 0) {
nvalues = split(line, a);
if (nvalues > 0) {
conf_lines[a[1]] = line;
}
}
}
if (segments) {
while ((getline line < segments) > 0) {
nvalues = split(line, a);
if (nvalues == 5) {
sentid = a[1];
segment_conv[sentid] = a[2];
segment_channel[sentid] = a[3];
segment_start[sentid] = a[4];
segment_end[sentid] = a[5];
}
}
close(segments);
}
}
function is_nonspeech(w) {
return w == pause || w == reject || w ~/^\[.*\]$/ || w ~/^<.*>$/;
}
{
orig_sentid = sentid = $1;
# strip speaker diacritics
sub("_s[1-9]$", "", sentid);
if (segments && sentid in segment_start) {
conv = segment_conv[sentid];
channel = segment_channel[sentid];
start_offset = segment_start[sentid];
end_offset = segment_end[sentid];
# derive channel and time information from sentids
# look for a pattern that encodes channel and
# start/end times
} else if (match(sentid, "_[0-9]_[-0-9][0-9]*_[0-9][0-9]*$")) {
# waveforms with [012] channel id, timemarks 1/1000s
# NOTE: this form is used by the segmenter
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = sentid_parts[2] / 1000;
end_offset = sentid_parts[3] / 1000;
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*$")) {
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = sentid_parts[2] / 100;
end_offset = sentid_parts[3] / 100;
# new sentids used by Ramana for SPINE segmentations
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*_[-0-9][0-9]*_[0-9][0-9]*$")) {
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = (sentid_parts[2]+sentid_parts[4]) / 100;
end_offset = (sentid_parts[2]+sentid_parts[5]) / 100;
} else {
print "cannot parse sentid " sentid >> "/dev/stderr";
conv = sentid;
channel = "?";
start_offset = 0;
end_offset = 10000;
}
$1 = "";
$0 = $0;
numwords = NF;
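	# fake time marks: spread the words evenly across the segment,
	# leaving "delta" seconds of padding at each edge.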
if (numwords > 0) {
word_dur = (end_offset - start_offset - 2 * delta)/numwords;
} else {
word_dur = 0;
}
# find confidence values for this sentid
if (confidences) {
if (!(orig_sentid in conf_lines)) {
print "no confidences for " orig_sentid >> "/dev/stderr";
} else {
delete conf_values;
n_conf_values = \
split(conf_lines[orig_sentid], conf_values);
}
}
for (i = 1; i <= numwords; i ++) {
if (is_nonspeech($i)) continue;
start_time = start_offset + delta + (i - 1) * word_dur;
if (i + 1 in conf_values) {
conf_value = conf_values[i + 1];
} else {
conf_value = 0;
}
# split multiwords
ncomps = split($i, word_comps, "_");
for (j = 1; j <= ncomps; j ++) {
print conv, channel, \
start_time + (j - 1) * word_dur/ncomps,\
word_dur/ncomps, \
toupper(word_comps[j]), \
conf_value | sort_cmd;
}
}
if (orig_sentid in conf_lines && numwords != n_conf_values - 1) {
print "mismatched number of confidences for " orig_sentid \
>> "/dev/stderr";
}
}

View File

@@ -0,0 +1,60 @@
#!/usr/local/bin/gawk -f
#
# sentid-to-sclite --
# convert sentid transcription format to sclite 'trn' format
#
# $Header: /home/srilm/CVS/srilm/utils/src/sentid-to-sclite.gawk,v 1.5 2016/09/23 20:05:51 stolcke Exp $
#
# i.e.:
# sentid word1 word2 ....
#
# becomes
#
# word1 word2 ... (sentid)
#
# The sentid is formatted to contain exactly one underscore,
# as sclite uses the first portion of the id as a speaker label to
# group results.
#
BEGIN {
format_sentids = 1;
}
{
sentid = $1;
$1 = "";
if (format_sentids) {
# reformat sentid
# <conv>_<channel>_<utterance> -> <conv><channel>_<utterance>
sub("[-_]A", "A", sentid);
sub("[-_]B", "B", sentid);
sub("[-_]ch1", "ch1", sentid);
sub("[-_]ch2", "ch2", sentid);
# remove underscore after corpus tag, if any
if (sentid ~ /^[a-z][a-z]*[-_][0-9]/) {
sub("[-_]", "", sentid);
}
# <conv>_<channel>_<utterance> -> <conv><channel>_<utterance>
sub("[-_]A", "A", sentid);
sub("[-_]B", "B", sentid);
sub("[-_]ch1", "ch1", sentid);
sub("[-_]ch2", "ch2", sentid);
# work around problems with negative start times in sentids
sub("_-", "_m", sentid);
#
# for sentid not containing _ or -, fake a speaker id out of the first
# three characters (this works for ATIS ...)
#
if (! (sentid ~ /[-_]/)) {
sentid = substr(sentid, 1, 3) "_" sentid;
}
}
print $0, "(" sentid ")";
}

View File

@@ -0,0 +1,56 @@
#!/usr/local/bin/gawk -f
#
# sort-lm --
# sort the ngrams in an LM in lexicographic order, as required for
# some other LM software (notably CMU's).
#
# usage: sort-lm lm-file > sorted-lm-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/sort-lm.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
sorter = "";
currorder = 0;
}
NF==0 {
print;
next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
print;
next;
}
/^\\[0-9]-grams:/ {
if (sorter) {
close(sorter);
}
currorder = substr($0,2,1);
print;
fflush();
# set up new sorting pipeline;
sorter = "sort";
for (i = 1; i <= currorder; i ++) {
sorter = sorter " +" i " -" (i+1);
}
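	# old-style sort keys: "+i -(i+1)" is the 0-based syntax for a key on
	# field i+1, so this sorts on the N-gram words while skipping the
	# log probability in field 1.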
# print sorter >> "/dev/stderr";
next;
}
/^\\/ {
if (sorter) {
close(sorter);
sorter = "";
}
currorder = 0;
print; next;
}
currorder && NF > 1 {
print | sorter;
next;
}
{
print;
}

View File

@@ -0,0 +1,57 @@
#!/usr/local/bin/gawk -f
#
# split-tagged-ngrams --
# multiply tagged-word ngrams out into ngrams that contain
# combinations of words and tags
#
# sample input:
# a/A b/B 10
# sample output:
# a b 10
# a B 10
# A b 10
# A B 10
#
# $Header: /home/srilm/CVS/srilm/utils/src/split-tagged-ngrams.gawk,v 1.2 2006/02/11 01:31:32 stolcke Exp $
#
BEGIN {
separator = "/";
}
# recursive expansion of the tagged-word ngram
function expand_ngram(ng, n, suffix, c,
word, tag, word_tag) {
if (n == 0) {
print suffix, c;
} else {
last_item = ng[n];
if (split(last_item, word_tag, separator) == 2) {
word = word_tag[1];
tag = word_tag[2];
expand_ngram(ng, n-1, word " " suffix, c);
expand_ngram(ng, n-1, tag " " suffix, c);
} else {
expand_ngram(ng, n-1, last_item " " suffix, c);
}
}
}
NF > 1 {
count = $NF;
delete ngram;
for (i = 1; i < NF; i ++) {
ngram[i] = $i;
}
expand_ngram(ngram, NF - 1, "", count);
next;
}
{
print;
}

View File

@@ -0,0 +1,42 @@
#!/usr/local/bin/gawk -f
#
# subset-context-ngrams --
# Extract counts corresponding to ngram contexts
#
# usage: subset-context-ngrams contexts=FILE COUNTS > SUBSET
#
# $Header: /home/srilm/CVS/srilm/utils/src/subset-context-ngrams.gawk,v 1.1 2008/09/30 03:54:05 stolcke Exp $
#
# read contexts
NR == 1 {
saveline = $0;
if (contexts != "") {
howmany = 0;
while ((getline < contexts) > 0) {
if (NF < 2) continue;
$NF = "";
subset_contexts[$0 FS] = 1;
howmany ++;
}
print "read " howmany " contexts" > "/dev/stderr";
}
$0 = saveline;
}
NF == 2 {
print;
next;
}
NF > 2 {
saveline = $0;
$NF = $(NF-1) = "";
if ($0 in subset_contexts) {
print saveline;
}
}

View File

@@ -0,0 +1,44 @@
#!/usr/local/bin/gawk -f
#
# subtract-ppls --
# Subtracts text statistics (from -ppl output)
#
# The first input file contains a total, from which subsequent stats are
# discounted. The result is printed in a format compatible with -ppl.
#
# Copyright (c) 1995, SRI International. All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/subtract-ppls.gawk,v 1.2 1997/07/12 05:01:08 stolcke Exp $
#
/^file .*: .* sentences/ {
if (ARGIND == 1) {
totalsents = $3;
totalwords = $5;
totaloovs = $7;
} else {
totalsents -= $3;
totalwords -= $5;
totaloovs -= $7;
}
getline;
if (ARGIND == 1) {
zeroprobs = $1;
totalprob = $4;
} else {
zeroprobs -= $1;
totalprob -= $4;
}
}
END {
M_LN10 = 2.30258509299404568402; # from <math.h>
ppl = exp (- M_LN10 * totalprob / \
(totalwords - totaloovs - zeroprobs + totalsents));
printf "file TOTAL: %d sentences, %d words, %d OOVs\n", \
totalsents, totalwords, totaloovs;
printf "%d zeroprobs, logprob= %g ppl= %g\n", \
zeroprobs, totalprob, ppl;
}

View File

@@ -0,0 +1,13 @@
#!/usr/local/bin/gawk -f
#
# tolower-ngram-counts --
# Map N-gram counts to lowercase
#
# $Header: /home/srilm/CVS/srilm/utils/src/tolower-ngram-counts.gawk,v 1.1 2007/07/13 23:38:22 stolcke Exp $
#
{
for (i = 1; i < NF; i ++) {
$i = tolower($i);
}
print;
}

View File

@@ -0,0 +1,65 @@
#!/usr/local/bin/gawk -f
#
# uniform-classes --
# Assign uniform membership probabilities to word class expansions
# that don't already have probabilities
#
# usage: uniform-classes CLASSFILE > UNIFORM-CLASSFILE
#
# $Header: /home/srilm/CVS/srilm/utils/src/uniform-classes.gawk,v 1.3 2016/05/13 23:00:35 stolcke Exp $
#
BEGIN {
num_class_defs = 0;
}
{
line = $0;
n = split(line, a);
if (n == 0) next;
class = a[1];
num_exp = ++ num_class_expansions[class];
if (a[2] ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
prob = a[2];
i = 3;
} else {
prob = "";
i = 2;
}
expansion = a[i];
for (i++; i <= n; i++) {
expansion = expansion " " a[i];
}
class_expansions[class " " num_exp] = expansion;
if (prob != "") {
class_expansion_probs[class " " num_exp] = prob;
}
num_class_defs ++;
}
END {
print "read " num_class_defs " class expansions" >> "/dev/stderr";
# assign default expansion probs
for (class in num_class_expansions) {
num_exp = num_class_expansions[class];
for (i = 1; i <= num_exp; i ++) {
prob = class_expansion_probs[class " " i];
if (prob == "") {
prob = 1/num_exp;
}
print class, prob, class_expansions[class " " i];
}
}
}

View File

@@ -0,0 +1,36 @@
#!/usr/local/bin/gawk -f
#
# uniq-ngram-counts --
# Collapse identical successive N-grams in counts file
#
# $Header: /home/srilm/CVS/srilm/utils/src/uniq-ngram-counts.gawk,v 1.2 2007/07/13 23:50:28 stolcke Exp $
#
{
if (NF == 1) {
ngram = " ";
} else {
ngram = "";
}
for (i = 1; i < NF; i ++) {
ngram = ngram " " $i;
}
# starting ngrams with space character forces string comparison
if (ngram != last_ngram) {
if (last_ngram != "") {
# avoid outputting initial space
print substr(last_ngram, 2), total_count;
}
total_count = 0;
last_ngram = ngram;
}
total_count += $NF;
}
END {
if (last_ngram != "") {
print substr(last_ngram, 2), total_count;
}
}

View File

@@ -0,0 +1,79 @@
#!/usr/local/bin/gawk -f
#
# vp2text --
#	Convert the ARPA CSR vp (verbalized punctuation) format to plain
# text for LM training.
#
# This combines the functionality of Roni Rosenfeld's "vp2svp1" and
# "sgml2text" utilities (except for case mapping). No <s> and </s>
# tags are retained, since our LM software doesn't need them.
#
# $Header: /home/srilm/CVS/srilm/utils/src/vp2text.gawk,v 1.2 1996/09/17 21:59:57 stolcke Exp $
#
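# Illustrative input line (leftover whitespace aside):
#	THE PRICE ROSE THREE %PERCENT ,COMMA ANALYSTS SAID .PERIOD
# becomes
#	THE PRICE ROSE THREE percent ANALYSTS SAID
# while SGML markup lines such as <art.100> or <DOC ...> are dropped entirely.
#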
BEGIN {
iquote = 0;
nquote = 5;
}
# Reset the quote counter at article boundaries
/^<art\./ {
iquote = 0;
}
/^<DOC/ {
iquote = 0;
}
#
# Filter out SGML tags
#
/^</ {
next;
}
#
# Do all the easy replacements
{
# These are pronounced
gsub("@AT-SIGN", "at");
gsub("&AMPERSAND", "and");
gsub("\\+PLUS", "plus");
gsub("=EQUALS", "equals");
gsub("%PERCENT", "percent");
gsub("/SLASH", "slash");
gsub("\\.POINT", "point");
# These aren't
gsub(",COMMA", "");
gsub("\\?QUESTION-MARK", "");
gsub(":COLON", "");
gsub("\#SHARP-SIGN", "");
gsub("'SINGLE-QUOTE", "");
gsub(";SEMI-COLON", "");
gsub("!EXCLAMATION-POINT", "");
gsub("{LEFT-BRACE", "");
gsub("}RIGHT-BRACE", "");
gsub("\\(LEFT-PAREN", "");
gsub("\\)RIGHT-PAREN", "");
gsub("\\.PERIOD", "");
gsub("\\.\\.\\.ELLIPSIS", "");
gsub("--DASH", "");
gsub("-HYPHEN", "");
}
# Handle lines containing "DOUBLE-QUOTE as a special case since this
# is more costly: replace every nquote'th occurrence with "quote", else
# delete it.
/"DOUBLE-QUOTE/ {
output = "";
for (i = 1; i <= NF; i++) {
if ($i == "\"DOUBLE-QUOTE") {
if ((iquote++) % nquote == 0) {
output = output " quote";
}
} else {
output = output " " $i;
}
}
print output;
next;
}
{
print;
}

View File

@@ -0,0 +1,138 @@
#!/usr/local/bin/gawk -f
#
# wlat-stats --
# Compute statistics of word posterior lattices
#
# $Header: /home/srilm/CVS/srilm/utils/src/wlat-stats.gawk,v 1.6 2019/07/24 16:16:55 stolcke Exp $
#
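# Input is nbest-lattice(1) output in one of two forms: word-lattice "node"
# lines or confusion-network "align" lines; if "reference" lines are present,
# 1-best and oracle (minimum achievable) error counts are computed as well.
#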
BEGIN {
name = "";
nhyps = 0;
entropy = 0;
nwords = 0;
ewords = 0; # posterior expected words
nsub = nins = ndel = 0; # 1best error counts
min_errs = 0; # oracle error count
M_LN10 = 2.30258509299404568402;
empty_hyp = "*DELETE*";
total_posterior = 1;
}
$1 == "name" {
name = $2;
next;
}
$1 == "posterior" {
total_posterior = $2;
next;
}
#
# word lattice format:
# node 46 them 11 0.011827 45 0.0111445 13 0.000682478 ...
#
$1 == "node" {
word = $3;
posterior = $5;
if (word != "NULL") {
nhyps ++;
}
if (posterior > 0) {
for (i = 6; i <= NF; i += 2) {
prob = $(i + 1);
if (prob > 0) {
entropy -= prob * log(prob/posterior);
if (word != "NULL") {
ewords += prob;
}
}
}
}
}
#
# confusion network format:
# align 4 okay 0.998848 ok 0.00113834 i 1.06794e-08 a 4.48887e-08 ...
#
$1 == "align" {
align_pos = $2;
best_hyp = "";
best_posterior = 0;
delete all_hyps;
for (i = 3; i <= NF; i += 2) {
word = $i;
if (word != "*DELETE*") {
nhyps ++;
}
prob = $(i + 1);
if (prob > 0) {
entropy -= prob/total_posterior * log(prob/total_posterior);
all_hyps[word] = 1;
if (word != "*DELETE*") {
ewords += prob/total_posterior;
}
}
if (prob > best_posterior) {
best_posterior = prob;
best_hyp = word;
}
}
}
$1 == "reference" && $2 == align_pos {
if ($3 != empty_hyp) {
nwords ++;
if (best_hyp == empty_hyp) {
ndel ++;
} else if (best_hyp != $3) {
nsub ++;
}
} else {
if (best_hyp != empty_hyp) {
nins ++;
}
}
# update oracle error
if (!($3 in all_hyps)) {
min_errs ++;
}
align_pos = -1;
}
END {
printf name (name != "" ? " " : "") \
nhyps " hypotheses " \
entropy/M_LN10 " entropy " \
ewords " ewords";
if (nwords > 0) {
printf " " nwords " words " nhyps/nwords " hyps/word " \
entropy/M_LN10/nwords " entropy/word";
}
printf "\n";
if (nwords > 0) {
nerrors = nsub + nins + ndel;
printf name (name != "" ? " " : "") \
nerrors " errors " nerrors*100/nwords " WER " \
nsub*100/nwords " SUB " nins*100/nwords " INS " \
ndel*100/nwords " DEL\n";
printf name (name != "" ? " " : "") \
min_errs " minerrors " min_errs*100/nwords " minWER\n";
}
}

View File

@@ -0,0 +1,105 @@
#!/usr/local/bin/gawk -f
#
# wlat-to-dot --
#	Generate dot(1) graph descriptions from word lattices generated by
# nbest-lattice(1)
#
# usage: wlat-to-dot [show_probs=1] file.wlat > file.dot
#
# $Header: /home/srilm/CVS/srilm/utils/src/wlat-to-dot.gawk,v 1.6 2004/11/02 02:00:35 stolcke Exp $
#
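# Typical follow-up, assuming the graphviz dot(1) tool is installed:
#	gawk -f wlat-to-dot.gawk show_probs=1 file.wlat > file.dot
#	dot -Tps file.dot > file.ps
#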
BEGIN {
name = "WLAT";
show_probs = 0;
show_nums = 0;
version = 1;
}
$1 == "name" {
name = $2;
}
#
# nbest-lattice output (without -use-mesh)
#
$1 == "initial" {
print "digraph \"" name "\" {";
print "rankdir = LR";
i = $2;
}
$1 == "final" {
i = $2;
}
$1 == "version" {
version = $2;
}
$1 == "node" && version == 1 {
from = $2;
word = $3;
post = $4;
print "\tnode" from " [label=\"" word \
(!show_nums ? "" : ("/" from)) \
(!show_probs ? "" : "\\n" post ) "\"]";
for (i = 5; i <= NF; i ++) {
to = $i;
print "\tnode" from " -> node" to ";"
}
}
$1 == "node" && version == 2 {
from = $2;
word = $3;
align = $4;
post = $5;
print "\tnode" from " [label=\"" word \
(!show_nums ? "" : ("/" from)) \
"\\n" align \
(!show_probs ? "" : "/" post ) "\"]";
for (i = 6; i <= NF; i += 2) {
to = $i;
print "\tnode" from " -> node" to \
(!show_probs ? "" : " [label=\"" $(i + 1) "\"]") ";"
}
}
#
# nbest-lattice -use-mesh output (confusion networks)
#
$1 == "numaligns" {
print "digraph \"" name "\" {";
print "rankdir = LR";
print "node0 [label=\"" (show_nums ? 0 : "") "\"]";
}
$1 == "align" {
pos = $2;
for (i = 3; i <= NF; i += 2) {
word = $i;
posterior = $(i + 1);
if (posterior == 0) {
print "align " pos ", word " word \
": zero posterior, omitting it" >> "/dev/stderr";
continue;
}
print "node" pos " -> node" (pos + 1) \
" [label=\"" word \
(show_probs ? ("\\n" posterior) : "") \
"\"]";
}
print "node" (pos + 1) " [label=\"" (show_nums ? (pos + 1) : "") "\"]";
}
END {
print "}"
}

Some files were not shown because too many files have changed in this diff.