competition update

This commit is contained in:
nckcard
2025-07-02 12:18:09 -07:00
parent 9e17716a4a
commit 77dbcf868f
2615 changed files with 1648116 additions and 125 deletions


@@ -0,0 +1,226 @@
#
# File: Makefile.example
# Author: The SRI DECIPHER (TM) System
# Date: Thu Sep 9 12:04:47 1993
#
# Description:
# This is the example makefile to start from when adding new
# modules to the DECIPHER System. To use this makefile, first
# copy it to your directory as the file "Makefile". Second,
# replace the word "Example" in the text below with the real name
# of your library. Next replace the example filenames with
# the names of your actual declarations and source files in the
# appropriate variable definitions. Finally clean up by deleting
# any lines not relevant to your module and updating this header
# to describe your new module. Do not forget to use the proper
# RCS keywords!
#
# Copyright (c) 1993, SRI International. All Rights Reserved.
#
# $Header: /home/srilm/CVS/srilm/utils/src/Makefile,v 1.76 2019/02/09 07:36:09 stolcke Exp $
#
# Include common SRILM variable definitions.
include $(SRILM)/common/Makefile.common.variables
# This should enable locale-specific string collation for vocabulary sorting
# (it will slow things down somewhat).
#ADDITIONAL_CXXFLAGS = -Dstrcmp=strcoll
# Flags for generating "compact" data structures
COMPACT_FLAGS += -DUSE_SARRAY -DUSE_SARRAY_TRIE -DUSE_SARRAY_MAP2
# Flags for generating "short" data structures
SHORT_FLAGS = $(COMPACT_FLAGS) -DUSE_SHORT_VOCAB -DUSE_XCOUNTS
# Flags for generating "long long" data structures
LLONG_FLAGS = $(COMPACT_FLAGS) -DUSE_LONGLONG_COUNTS -DUSE_XCOUNTS
# enable use of liblbfgs if indicated
ifneq ($(HAVE_LIBLBFGS), )
ADDITIONAL_CFLAGS += -DHAVE_LIBLBFGS
ADDITIONAL_CXXFLAGS += -DHAVE_LIBLBFGS
endif
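# For example, "make HAVE_LIBLBFGS=yes" on the make command line (or,
# presumably, an equivalent setting in the machine-specific makefile)
# enables the liblbfgs-dependent code paths above.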
ADDITIONAL_LDFLAGS += \
$(MATHERR_LINK)
ADDITIONAL_LIBRARIES += \
$(SRILM_LIBDIR)/$(LIB_PREFIX)oolm$(LIB_SUFFIX) \
$(SRILM_LIBDIR)/$(LIB_PREFIX)dstruct$(LIB_SUFFIX) \
$(SRILM_LIBDIR)/$(LIB_PREFIX)misc$(LIB_SUFFIX) \
$(SRILM_LIBDIR)/$(LIB_PREFIX)z$(LIB_SUFFIX) \
$(MATH_LIBRARY) \
$(LBFGS_LIBRARY)
# Exported programs.
REAL_PROGRAM_NAMES = \
nbest-rover-helper
# Example programs.
PROGRAM_NAMES = $(REAL_PROGRAM_NAMES)
PROGRAMS = $(PROGRAM_NAMES:%=$(BINDIR)/%$(EXE_SUFFIX))
PROGRAM_SOURCES = $(foreach prog,$(PROGRAM_NAMES),\
$(wildcard $(SRCDIR)/$(prog).c) \
$(wildcard $(SRCDIR)/$(prog).cc))
PROGRAM_OBJECTS = $(PROGRAM_NAMES:%=$(OBJDIR)/%$(OBJ_SUFFIX))
# Libraries to be linked with the Example programs.
LIBRARIES = $(LIBRARY) \
$(ADDITIONAL_LIBRARIES)
# All of the types of files.
ALL_SOURCES = $(PROGRAM_SOURCES)
ALL_OBJECTS = $(PROGRAM_OBJECTS)
ALL_PROGRAMS = $(PROGRAMS)
ALL_PROGRAM_NAMES = $(PROGRAM_NAMES)
#
SCRIPTS = \
rescore-nbest \
wordlat-to-lisp \
extract-skip-probs \
$(EXPORTED_SCRIPTS)
EXPORTED_SCRIPTS = \
change-lm-vocab \
empty-sentence-lm \
rescore-decipher \
rescore-acoustic \
rescore-reweight \
rescore-minimize-wer \
make-batch-counts \
merge-batch-counts \
make-big-lm \
make-multiword-pfsg \
pfsg-from-ngram \
nbest-error \
nbest-rover \
search-rover-combo \
rexport.gnumake \
align-with-tags \
compute-sclite \
compute-sclite-nbest \
compare-sclite \
cumbin
# scripts that need to be edited before installation
EDIT_SCRIPTS = \
add-classes-to-pfsg \
add-dummy-bows \
add-pauses-to-pfsg \
add-ppls \
bytelog-to-log10 \
classes-to-fsm \
combine-acoustic-scores \
combine-rover-controls \
rover-control-weights \
rover-control-tying \
compare-ppls \
compute-best-mix \
compute-best-rover-mix \
compute-best-sentence-mix \
compute-oov-rate \
concat-sausages \
context-ngrams \
continuous-ngram-count \
de-vq-lm \
extract-skip-probs \
filter-event-counts \
find-reference-posteriors \
fix-ctm \
fsm-to-pfsg \
get-gt-counts \
get-unigram-probs \
hits-from-log \
log10-to-bytelog \
make-abs-discount \
make-diacritic-map \
make-google-ngrams \
make-gt-discounts \
make-kn-discounts \
make-kn-counts \
make-hiddens-lm \
make-lm-subset \
make-nbest-pfsg \
make-ngram-pfsg \
make-sub-lm \
metadb \
sort-lm \
reverse-lm \
merge-nbest \
nbest-posteriors \
nbest2-to-nbest1 \
nbest-optimize-args-from-rover-control \
nbest-oov-counts \
nbest-vocab \
nbest-words \
pfsg-to-dot \
pfsg-to-fsm \
pfsg-vocab \
htklat-vocab \
ppl-from-log \
remove-lowprob-ngrams \
replace-unk-words \
replace-words-with-classes \
reverse-text \
reverse-ngram-counts \
sentid-to-sclite \
sentid-to-ctm \
split-tagged-ngrams \
subset-context-ngrams \
subtract-ppls \
tolower-ngram-counts \
uniform-classes \
uniq-ngram-counts \
vp2text \
wlat-to-dot \
wlat-to-pfsg \
wlat-stats \
wordlat-to-lisp \
prettify \
select-vocab
# Define targets.
all: $(PROGRAMS)
$(LIBRARY): $(LIB_OBJECTS)
$(ARCHIVE) $(AR_OUTPUT_OPTION) $^ $(DEMANGLE_FILTER)
$(RANLIB) $@ $(DEMANGLE_FILTER)
$(PROGRAMS): $(LIBRARY) $(OTHER_LIBRARIES)
# Variables and Targets for released system
EXPORTED_PROGRAMS = \
$(EDIT_SCRIPTS:%=$(BINDIR)/%) \
$(REAL_PROGRAM_NAMES:%=$(BINDIR)/%$(EXE_SUFFIX))
release: release-scripts release-programs
# Include common SRILM target definitions.
include $(SRILM)/common/Makefile.common.targets
#
# Rule to create edited gawk script
#
$(BINDIR)/%: $(SRCDIR)/%.gawk $(BINDIR_STAMP) $(SRILM)/common/Makefile.machine.$(MACHINE_TYPE)
sed -e '1s,/usr/local/bin/gawk,$(GAWK),' $< >$@.new
mv $@.new $@
#
# Rule to create edited perl script
#
$(BINDIR)/%: $(SRCDIR)/%.pl $(BINDIR_STAMP) $(SRILM)/common/Makefile.machine.$(MACHINE_TYPE)
sed -e '1s,/usr/local/bin/perl,$(PERL),' $< >$@.new
mv $@.new $@


@@ -0,0 +1,172 @@
#!/usr/local/bin/gawk -f
#
# add-classes-to-pfsg --
# Modify Decipher PFSG by expanding class nodes with words
#
# usage: add-classes-to-pfsg classes=<expansions> pfsg > expanded-pfsg
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-classes-to-pfsg.gawk,v 1.5 2004/11/02 02:00:35 stolcke Exp $
#
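# A hypothetical example of the <expansions> file read below (class name,
# optional probability, then the expansion words):
#
#	CITY 0.6 new york
#	CITY 0.4 san francisco
#	GREETING hello there
#
# Expansion lines without an explicit probability default to 1/N, where N is
# the number of expansions listed for that class.
#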
function read_classes(file) {
num_class_defs = 0;
delete num_class_expansions;
delete class_expansions;
delete class_expansion_probs;
while ((getline line < file) > 0) {
n = split(line, a);
if (n == 0) continue;
class = a[1];
num_exp = ++ num_class_expansions[class];
if (a[2] ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
prob = a[2];
i = 3;
} else {
prob = "";
i = 2;
}
expansion = a[i];
for (i++; i <= n; i++) {
expansion = expansion " " a[i];
}
class_expansions[class " " num_exp] = expansion;
if (prob != "") {
class_expansion_probs[class " " num_exp] = prob;
}
num_class_defs ++;
}
print "read " num_class_defs " class expansions" >> "/dev/stderr";
# assign default expansion probs
for (class in num_class_expansions) {
num_exp = num_class_expansions[class];
for (i = 1; i <= num_exp; i ++) {
if (class_expansion_probs[class " " i] == "") {
class_expansion_probs[class " " i] = 1/num_exp;
}
}
}
}
######################################################################
BEGIN {
logscale = 10000.5;
round = 0.5;
null = "NULL";
classes_toupper = 1; # map class names to upper case
}
function rint(x) {
if (x < 0) {
return int(x - round);
} else {
return int(x + round);
}
}
function scale_prob(x) {
return rint(log(x) * logscale);
}
function print_class_pfsg(class) {
print "name " (classes_toupper ? toupper(class) : class);
# compute total number of nodes needed
num_exp = num_class_expansions[class];
num_words = 0;
all_words = "";
for (i = 1; i <= num_exp; i ++) {
num_words += split(class_expansions[class " " i], a);
all_words = all_words " " class_expansions[class " " i];
}
print "nodes " (num_words + 2) " " null " " null all_words;
initial = 0;
final = 1;
print "initial " initial;
print "final " final;
print "transitions " (num_words + num_exp);
node_index = final;
for (i = 1; i <= num_exp; i ++) {
n = split(class_expansions[class " " i], a);
if (n == 0) {
print initial, final, \
scale_prob(class_expansion_probs[class " " i]);
} else {
print initial, ++node_index, \
scale_prob(class_expansion_probs[class " " i]);
for (k = 2; k <= n; k ++) {
print node_index, node_index + 1, 0;
node_index ++;
}
print node_index, final, 0;
}
}
print "";
}
NR == 1 {
if (classes) {
read_classes(classes);
}
close(classes);
}
# record class names used in PFSGs
$1 == "nodes" {
for (i = 3; i <= NF; i ++) {
if ($i != null && $i in num_class_expansions) {
class_used[$i] = 1;
if (classes_toupper) {
upper_class = toupper($i);
if ($i != upper_class && upper_class in num_class_expansions) {
print "cannot map class " $i \
" to uppercase due to name conflict" >> "/dev/stderr";
exit 1;
}
$i = upper_class;
}
}
}
print;
next;
}
# pass old PFSGs through unchanged
{
print;
}
# dump out class PFSGs
END {
print "";
for (class in class_used) {
print_class_pfsg(class);
}
}


@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# add-dummy-bows --
# add redundant backoff weights to model file to make some broken
# programs happy.
# (Normally a backoff weight is only required for ngrams that
# are prefixes of longer ngrams.)
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-dummy-bows.gawk,v 1.1 1995/09/20 17:36:30 stolcke Exp $
#
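# For illustration (hypothetical trigram LM): a bigram entry that lacks a
# backoff weight, e.g.
#	-1.2345 in the
# is printed with a dummy zero backoff weight appended:
#	-1.2345 in the	0
#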
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
if (order > highorder) highorder = order;
print;
next;
}
/^.[0-9]-grams:/ {
currorder=substr($0,2,1);
}
/^\\/ {
print; next;
}
currorder && currorder < highorder {
if (NF < currorder + 2) {
print $0 "\t0";
} else {
print;
}
next;
}
{ print }


@@ -0,0 +1,171 @@
#!/usr/local/bin/gawk -f
#
# add-pauses-to-pfsg --
# Modify Decipher PFSG to allow pauses between words
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-pauses-to-pfsg.gawk,v 1.15 2015-07-03 03:44:52 stolcke Exp $
#
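# A hypothetical invocation (the variable assignments are optional and
# override the defaults set in the BEGIN block below; "words.txt" is a
# placeholder word list):
#
#	add-pauses-to-pfsg vocab=words.txt pauselast=1 model.pfsg > model.pause.pfsg
#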
BEGIN {
pause = "-pau-";
top_level_name = "TOP_LEVEL";
pause_filler_name = "PAUSE_FILLER";
null = "NULL";
wordwrap = 1; # wrap pause filler around words
pauselast = 0; # make pauses follow wrapped words
version = 0; # no "version" line by default
}
#
# output the TOP_LEVEL model
# oldname is the name of the original pfsg
function print_top_level(oldname) {
if (version) {
print "version " version "\n";
}
print "name " top_level_name;
if (pauselast) {
print "nodes 4 " null " " pause_filler_name " " oldname " " null;
} else {
print "nodes 4 " null " " oldname " " pause_filler_name " " null;
}
print "initial 0"
print "final 3"
print "transitions 4"
print "0 1 0"
print "1 2 0"
if (pauselast) {
print "0 2 0"
} else {
print "1 3 0"
}
print "2 3 0"
print "";
}
function word_wrapper_name(word) {
return "_" word "_PF";
}
#
# output a pause wrapper for word
#
function print_word_wrapper(word) {
print "name " word_wrapper_name(word);
if (pauselast) {
print "nodes 3 " word " " pause_filler_name " " null;
} else {
print "nodes 3 " null " " pause_filler_name " " word;
}
print "initial 0";
print "final 2";
print "transitions 3";
print "0 1 0";
print "1 2 0";
print "0 2 0";
print "";
}
#
# output the pause filler
#
function print_pause_filler() {
print "name " pause_filler_name;
print "nodes 3 " null " " pause " " null;
print "initial 0";
print "final 2";
print "transitions 3";
print "0 1 0";
print "1 1 0";
print "1 2 0";
}
NF == 0 {
print;
next;
}
#
# read vocabulary list if supplied
#
NR == 1 && vocab != "" {
while (getline line < vocab) {
if (split(line, a)) {
word_list[a[1]] = 1;
}
}
close (vocab);
}
#
# check that a node name is a word
# if a vocabulary was not specified we use the following heuristic:
# word nodes contain at least one lowercase or non-ascii character and are not
# surrounded by "*...*" (which indicates a class name).
#
function is_word(w) {
if (vocab) {
return w in word_list;
} else {
return !is_classname(w);
}
}
function is_classname(w) {
return w ~ /^\*.*\*$/ || !(w ~ /[[:lower:]]/ || w ~ /[^\x00-\x7F]/);
}
#
# first time we see a pfsg name, issue a top-level wrapper for it.
#
$1 == "name" && !have_top_level {
print_top_level($2);
print;
have_top_level = 1;
next;
}
#
# maps word nodes to wrapper nodes
#
$1 == "nodes" {
numnodes = $2;
printf "nodes %d", numnodes;
for (i = 0; i < numnodes; i ++) {
node_name = $(i + 3);
# if it contains lowercase characters it's a word and
# needs to be wrapped
if (wordwrap && is_word(node_name) && \
node_name != pause && node_name != null)
{
if (!(node_name in all_words)) {
all_words[node_name] = 1;
words[++num_words] = node_name;
}
printf " %s", word_wrapper_name(node_name);
} else {
printf " %s", node_name;
}
}
printf "\n";
next;
}
{
print;
}
END {
#
# output the word wrappers
#
if (wordwrap) {
for (i = 1; i <= num_words; i ++) {
print_word_wrapper(words[i]);
}
}
print_pause_filler();
}


@@ -0,0 +1,30 @@
#!/usr/local/bin/gawk -f
#
# add-ppls --
# Add text statistics (from -ppl output)
#
# Copyright (c) 1995,1997 SRI International. All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-ppls.gawk,v 1.2 1997/07/12 05:01:08 stolcke Exp $
#
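# A hypothetical usage sketch: sum the statistics from several ngram -ppl
# outputs and report the aggregate perplexity (file names are placeholders):
#
#	ngram -lm my.lm -ppl part1.txt > ppl.1
#	ngram -lm my.lm -ppl part2.txt > ppl.2
#	add-ppls ppl.1 ppl.2
#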
/^file .*: .* sentences/ {
totalsents += $3;
totalwords += $5;
totaloovs += $7;
getline;
zeroprobs += $1;
totalprob += $4;
}
END {
M_LN10 = 2.30258509299404568402; # from <math.h>
ppl = exp (- M_LN10 * totalprob / \
(totalwords - totaloovs - zeroprobs + totalsents));
printf "file TOTAL: %d sentences, %d words, %d OOVs\n", \
totalsents, totalwords, totaloovs;
printf "%d zeroprobs, logprob= %g ppl= %g\n", \
zeroprobs, totalprob, ppl;
}


@@ -0,0 +1,194 @@
#!/bin/sh
#
# align-with-tags --
# align reference transcript with tags to hypothesized
# transcripts, merging the tags into the latter
#
# $Header: /home/srilm/CVS/srilm/utils/src/align-with-tags,v 1.7 2015-07-03 03:45:38 stolcke Exp $
#
usage () {
echo "usage: $0 [-r ref -h hyp] [-dictionary D] [-aligndir A] [-options...]" >&2
exit 2;
}
ref=/dev/null
hyp=/dev/null
dictionary=/dev/null
while [ $# -gt 0 ]; do
case "$1" in
-r) ref="$2"
shift; shift;;
-h) hyp="$2"
shift; shift;;
-dictionary)
dictionary=$2
shift; shift;;
-aligndir)
aligndir=$2
shift; shift;;
-\?) usage;;
-*) pass_options="$pass_options $1"
shift;;
*) break;;
esac
done
if [ $# -ge 2 ]; then
ref="$1"
hyp="$2"
elif [ $# -gt 0 ]; then
usage;
fi
tmpdir=${TMPDIR-/tmp}
tmpdict="$tmpdir/dict$$"
tmptags="$tmpdir/tags$$"
tmprefs="$tmpdir/refs$$"
tmphyps="$tmpdir/hyps$$"
tmpnbest="$tmpdir/nbest$$"
tmpmerge="$tmpdir/merged$$"
if [ -n "$aligndir" ]; then
tmpmerge=
fi
trap "rm -rf $tmpdict $tmptags $tmprefs $tmphyps $tmpnbest $tmpmerge; exit" 0 1 2 15
if [ -n "$aligndir" ]; then
mkdir -p $aligndir
tmpmerge=$aligndir
fi
prepare_text () {
${GAWK-gawk} -v tag_file=$2 '
BEGIN {
tag_list["<default>"] = 1;
}
function is_tag(x) {
return (x ~ /^<.*>$/);
}
{
for (i = 2; i <= NF; i ++) {
if (is_tag($i)) {
tag_list[$i] = 1;
} else {
$i = tolower($i);
}
if (!is_tag($(i - 1)) && !is_tag($i)) {
$(i - 1) = $(i - 1) " <default>";
}
}
if (!is_tag($NF)) {
$NF = $NF " <default>";
}
print $0;
}
END {
if (tag_file) {
for (tag in tag_list) {
print tag > tag_file;
}
}
}' $1;
}
parse_alignment () {
gzip -d -c -f < $1 | \
${GAWK-gawk} -v sentid=$2 'BEGIN {
output = sentid;
show_refs = 1;
}
function is_empty(x) {
return x == "<default>" || tolower(x) == "*delete*";
}
function is_tag(x) {
return x ~ /^<.*>$/;
}
$1 == "align" {
if (NF == 4 && $4 == 1) {
# matching hyp and ref
if (!is_empty($3)) {
output = output " " $3;
}
} else if (NF == 6 && $4 == 1 && $6 == 0) {
# mismatched hyp and ref
if (is_empty($3)) {
if (is_tag($5)) {
if (!is_empty($5)) \
output = output " " $5;
} else if (show_refs) {
output = output " (" $5 ")";
}
} else {
if (is_empty($5) || !show_refs) {
output = output " " $3;
} else {
output = output " " $3 " (" $5 ")";
}
}
} else {
print "unexpected alignment: " $0 > "/dev/stderr";
}
}
END {
print output;
}'
}
set -e
#
# format hyps and refs for alignment
#
prepare_text $ref $tmptags > $tmprefs
prepare_text $hyp > $tmphyps
#
# add tag pronunciations to the dictionary
#
if [ $dictionary != /dev/null ]; then
gzip -d -c -f $dictionary > $tmpdict
else
> $tmpdict
fi
${GAWK-gawk} '{ print $1, "**TAG**" }' $tmptags >> $tmpdict
#
# do the alignments
#
mkdir -p $tmpnbest $tmpmerge
cat $tmphyps | \
while read sentid words
do
echo "0 0 0 $words" > $tmpnbest/$sentid
echo $tmpnbest/$sentid
done | \
nbest-lattice -nbest-files - \
-use-mesh \
-dictionary $tmpdict \
-keep-noise \
-refs "$tmprefs" \
$pass_options \
-write-dir $tmpmerge | \
(
last_sentid=
while read sentid rest
do
if [ -n "$last_sentid" ]; then
parse_alignment $tmpmerge/$last_sentid.gz $last_sentid
fi
last_sentid=$sentid
done
if [ -n "$last_sentid" ]; then
parse_alignment $tmpmerge/$last_sentid.gz $last_sentid
fi
)


@@ -0,0 +1,19 @@
#!/usr/local/bin/gawk -f
#
# bytelog-to-log10 --
# convert bytelog scores to log-base-10
#
# $Header: /home/srilm/CVS/srilm/utils/src/bytelog-to-log10.gawk,v 1.2 2002/05/15 04:47:13 stolcke Exp $
#
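# The conversion implemented below (with the default scale=1) is
#	log10(p) = bytelog / (ln(10) * 10000.5 / 1024)
# so, for example, a bytelog score of about -22.5 corresponds to roughly -1
# in log base 10.
#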
BEGIN {
logscale = 2.30258509299404568402 * 10000.5 / 1024.0;
scale = 1;
}
{
for (i = 1; i <= NF; i ++) {
if ($i ~ /^[-+]+[0-9][0-9]*$/) {
$i = $i / scale / logscale;
}
}
print;
}


@@ -0,0 +1,78 @@
#!/bin/sh
#
# change-lm-vocab --
# create a language model from an existing one by changing its
# vocabulary.
# All n-grams in the new vocab are retained with their original
# probabilities. Backoff weights are recomputed and backed-off
# unigrams for all new words are added.
# -subset option performs subsetting of the vocabulary without adding
# new words.
#
# usage: change-lm-vocab [-subset] -vocab vocabfile -lm oldlm -write-lm newlm
#
# $Header: /home/srilm/CVS/srilm/utils/src/change-lm-vocab,v 1.9 2013/03/09 07:13:01 stolcke Exp $
#
oldlm=-
newlm=-
vocab=/dev/null
while [ $# -gt 0 ]; do
case "$1" in
-vocab) vocab="$2" ; shift ;;
-lm) oldlm="$2" ; shift ;;
-write-lm) newlm="$2" ; shift ;;
-tolower) options="$options $1" ; tolower=1 ;;
-subset) subset=yes ;;
*) options="$options $1" ;;
esac
shift
done
# -subset prevents new words being added to the LM
if [ "$subset" ]; then
ngram_vocab="/dev/null"
else
ngram_vocab="$vocab"
fi
gzip -dcf $oldlm | ${GAWK-gawk} '
# read the vocab file
NR == 1 && vocab {
# always include sentence begin/end
is_word["<s>"] = is_word["</s>"] = 1;
while ((getline word < vocab) > 0) {
is_word[to_lower ? tolower(word) : word] = 1;
}
close(vocab);
}
# process old lm
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
print;
next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
print;
next;
}
/^\\/ {
print; next;
}
currorder {
for (i = 2 ; i <= currorder + 1; i ++) {
if (!((to_lower ? tolower($i) : $i) in is_word)) next;
}
print;
next;
}
{ print }
' vocab=$vocab to_lower=$tolower | \
ngram -lm - -vocab "$ngram_vocab" -renorm -write-lm "$newlm" $options


@@ -0,0 +1,134 @@
#!/usr/local/bin/gawk -f
#
# usage: classes-to-fsm [symbolic=1] [isymbolfile=ISYMBOLS] [osymbolfile=OSYMBOLS] \
# vocab=VOCAB CLASSES > class.fsm
#
# where ISYMBOLS is the input symbol table, OSYMBOLS is the output symbol table
# VOCAB is the word list
#
# $Header: /home/srilm/CVS/srilm/utils/src/classes-to-fsm.gawk,v 1.1 1999/09/27 01:10:27 stolcke Exp $
#
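# A hypothetical class definition line (in the CLASS [PROB] WORD1 ... WORDN
# format handled below):
#
#	CITY 0.5 new york
#
# produces a transition path that consumes CITY on the input side and emits
# "new" and "york" on the output side, with the class-expansion probability
# encoded as a scaled negative-log cost on the first arc.
#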
BEGIN {
empty_input = "NULL";
empty_output = "NULL";
input_symbols[empty_input] = 0;
output_symbols[empty_output] = 0;
numinputs = 1;
numoutputs = 1;
isymbolfile = "";
osymbolfile = "";
symbolic = 0;
startstate = 0;
numstates = 1;
M_LN10 = 2.30258509299404568402; # from <math.h>
logscale = 10000.5;
round = 0.5;
}
NR == 1 {
# print start/end state
print startstate;
if (vocab) {
while ((getline vline < vocab) > 0) {
if (split(vline, a) >= 1) {
word = a[1];
input_symbols[word] = numinputs ++;
output_symbols[word] = numoutputs ++;
# print identity transition for vocab words
print startstate, startstate, \
(symbolic ? word : input_symbols[word]), \
(symbolic ? word : output_symbols[word]);
}
}
}
}
function rint(x) {
if (x < 0) {
return int(x - round);
} else {
return int(x + round);
}
}
function scale_prob(x) {
return rint(log(x) * logscale);
# return log(x) / M_LN10;
}
# input format is
# CLASS [PROB] WORD1 WORD2 ... WORDN
{
if (NF == 0) {
next;
}
class = $1;
if (!(class in input_symbols)) {
input_symbols[class] = numinputs++;
}
if ($2 ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
prob = $2;
first = 3;
} else {
prob = 1;
first = 2;
}
# deal with empty class expansion: map class to NULL
if (first > NF) {
print startstate, startstate, \
(symbolic ? class : input_symbols[class]), \
(symbolic ? empty_output : 0), \
-scale_prob(prob);
}
for (i = first; i <= NF; i ++) {
if (!($i in output_symbols)) {
output_symbols[$i] = numoutputs ++;
}
if (i == NF) {
next_state = startstate;
} else {
next_state = numstates ++;
}
if (i == first) {
print startstate, next_state,
(symbolic ? class : input_symbols[class]), \
(symbolic ? $i : output_symbols[$i]), \
-scale_prob(prob);
} else {
print last_state, next_state,
(symbolic ? empty_input : 0), \
(symbolic ? $i : output_symbols[$i]), \
-scale_prob(1);
}
last_state = next_state;
}
}
END {
if (isymbolfile) {
for (word in input_symbols) {
print word, input_symbols[word] > isymbolfile;
}
close(isymbolfile);
}
if (osymbolfile) {
for (word in output_symbols) {
print word, output_symbols[word] > osymbolfile;
}
close(osymbolfile);
}
}


@@ -0,0 +1,114 @@
#!/usr/local/bin/gawk -f
#
# combine acoustic scores in nbest lists with additional acoustic score files
# (used by rescore-acoustic and nbest-rover)
#
# Setting "max_nbest" limits the number of hyps retrieved from each
# input list.
# If max_nbest is set and an additional score file contains fewer values
# than the nbest list has hyps, the missing values are filled in with the
# minimal score found in that file.
#
# $Header: /home/srilm/CVS/srilm/utils/src/combine-acoustic-scores.gawk,v 1.9 2019/02/22 20:55:10 stolcke Exp $
#
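# A hypothetical invocation (as used by rescore-acoustic/nbest-rover; file
# names are placeholders): the first argument is the nbest list, the remaining
# arguments are extra score files, and "weights" must supply one weight per
# file (passed with -v so it is not mistaken for a file):
#
#	combine-acoustic-scores -v weights="1.0 0.3" -v max_nbest=200 \
#		nbest/sent001.gz scores/sent001.extra > combined/sent001
#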
function get_from_file(i) {
if (ARGV[i] ~ /\.gz$/) {
status = (("exec gzip -dc " ARGV[i]) | getline);
} else {
status = (getline < ARGV[i]);
}
if (status < 0) {
print "error reading from " ARGV[i] >> "/dev/stderr";
exit 1;
}
return status;
}
BEGIN {
hypno = 0;
sentid = ARGV[1];
sub(".*/", "", sentid);
sub("\\.gz$", "", sentid);
sub("\\.score$", "", sentid);
bytelogscale = 1024.0 / 10000.5 / 2.30258509299404568402;
nweights = split(weights, weight);
if (nweights != ARGC - 1) {
print "number of weights doesn't match number of score files" \
>> "/dev/stderr";
exit 1;
}
# format of input nbest list
nbestformat = 0;
while ((max_nbest == 0 || hypno < max_nbest) && get_from_file(1)) {
if ($1 == "NBestList1.0") {
nbestformat = 1;
print;
continue;
} else if ($1 == "NBestList2.0") {
nbestformat = 2;
print;
continue;
}
old_ac = $1; $1 = "";
if (nbestformat > 0) {
# Decipher nbest format: just use the aggregate
# score as the acoustic score
# For version 2 format, the total score is updated,
# reflecting the change in acoustic scores.
# Other programs recover the acoustic score as the
# difference of the total score and the accumulated
# LM scores, so this gives the right results.
gsub("[()]", "", old_ac);
old_ac *= bytelogscale;
}
hyp = $0;
total_ac = weight[1] * old_ac;
for (i = 2; i < ARGC; i ++) {
if (!get_from_file(i)) {
if (max_nbest == 0) {
print "missing score in " ARGV[i] \
>> "/dev/stderr";
exit 2
} else {
new_ac = min_score[i];
}
} else {
# skip nbest header
if ($1 ~ /NBestList/) {
i --;
continue;
}
new_ac = $1;
# handle decipher-style scores
if (new_ac ~ /\(.*\)/) {
gsub("[()]", "", new_ac);
new_ac *= bytelogscale;
}
# replace minimum score if needed
if (!(i in min_score) || $1 < min_score[i]) {
min_score[i] = new_ac;
}
}
total_ac += weight[i] * new_ac;
}
if (nbestformat > 0) {
total_ac = sprintf("(%f)", total_ac / bytelogscale);
}
print total_ac hyp;
hypno ++;
}
}


@@ -0,0 +1,163 @@
#!/usr/local/bin/gawk -f
#
# combine-rover-controls --
# combine several rover control files for system combination
# (may be used recursively)
#
# $Header: /home/srilm/CVS/srilm/utils/src/combine-rover-controls.gawk,v 1.7 2017/08/16 06:34:16 stolcke Exp $
#
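# A hypothetical rover control file (fields as handled below: nbest directory,
# LM weight, word transition weight, system weight, nbest depth limit, and
# posterior scale, with defaults filled in for missing fields):
#
#	sys1/nbest 8 0 1
#	sys2/nbest 9 0.5 2 100
#
# A line whose third field is "+" adds an extra score directory (with its
# weight) to the system defined by the following line.
#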
function process_rover_control(file, weight, pscale) {
dir = file;
sub("/[^/]*$", "", dir);
if (file == dir) {
dir = "";
}
while ((status = (getline < file)) > 0) {
if (NF == 0) continue;
# skip comment line
if (/^##/) continue;
if (!keep_paths) {
# deal with relative directories in rover-control file:
# prepend rover-control directory path
if ($1 !~ /^\// && dir != "") {
$1 = dir "/" $1;
}
}
if ($3 == "+") {
system_id = system_id $1 " " $2 " +\n";
} else {
nsystems += 1;
# handle missing lmw and wtw and system weights
if ($2 == "") $2 = 8;
if ($3 == "") $3 = 0;
if ($4 == "") $4 = 1;
# missing nbest depth limit
if ($5 == "") nbest_depth[nsystems] = 0;
else nbest_depth[nsystems] = $5;
# override posterior scale if specified
if (pscale) system_pscale[nsystems] = pscale;
else system_pscale[nsystems] = $6
system_id = system_id $1 " " $2 " " $3;
# see if this system has appeared before
if (system_id in system_index) {
# merge system weights
# ensuring weight tying spec is compatible
if ($4 == "=") {
if (system_weight[system_index[system_id]] != "=") {
print "cannot combine weight tying" > "/dev/stderr";
exit(1);
}
} else {
if (system_weight[system_index[system_id]] == "=") {
print "cannot combine weight tying" > "/dev/stderr";
exit(1);
}
system_weight[system_index[system_id]] += $4 * weight;
}
# skip the duplicate system
nsystems -= 1;
} else {
# divide system weight by total number of input files
# but preserve weight tying info
if ($4 == "=") {
system_weight[nsystems] = $4;
} else {
system_weight[nsystems] = $4 * weight;
}
system_dirs_weights[nsystems] = system_id;
system_index[system_id] = nsystems;
}
system_id = "";
}
}
if (status < 0) {
print file ": " ERRNO > "/dev/stderr";
exit(1);
}
close(file);
return;
}
BEGIN {
arg_offset = 0;
ninputs = ARGC - 1;
nsystems = 0;
while (1) {
if (ARGV[arg_offset+1] ~ /^lambda=/) {
lambda = substr(ARGV[arg_offset+1], length("lambda")+2);
ninputs -= 1;
arg_offset += 1;
} else if (ARGV[arg_offset+1] ~ /^postscale=/) {
postscale = substr(ARGV[arg_offset+1], length("postscale")+2);
ninputs -= 1;
arg_offset += 1;
} else if (ARGV[arg_offset+1] ~ /^norm=/) {
norm_weights = substr(ARGV[arg_offset+1], length("norm")+2);
ninputs -= 1;
arg_offset += 1;
} else if (ARGV[arg_offset+1] ~ /^keeppaths=/) {
keep_paths = substr(ARGV[arg_offset+1], length("keeppaths")+2);
ninputs -= 1;
arg_offset += 1;
} else {
break;
}
}
if (ninputs < 1) {
print "usage: " ARGV[0] " [lambda=WEIGHTS] [postscale=S] ROVER-CTRL1 ROVER-CTRL2 ..." \
>> "/dev/stderr";
exit(2);
}
# initialize priors from lambdas
nlambdas = split(lambda, lambdas);
lambda_sum = 0.0;
for (i = 1; i <= nlambdas; i ++) {
lambda_sum += lambdas[i];
}
# fill in the missing lambdas with uniform values
for (i = nlambdas + 1; i <= ninputs; i ++) {
lambdas[i] = (1 - lambda_sum)/(ninputs - nlambdas);
}
for (i = 1; i <= ninputs; i ++) {
process_rover_control(ARGV[arg_offset + i], lambdas[i], postscale);
}
if (norm_weights) {
weight_sum = 0;
for (i = 1; i <= nsystems; i ++) {
weight_sum += system_weight[i];
}
for (i = 1; i <= nsystems; i ++) {
system_weight[i] /= weight_sum;
}
}
for (i = 1; i <= nsystems; i ++) {
print system_dirs_weights[i], system_weight[i], nbest_depth[i], system_pscale[i];
}
exit(0);
}


@@ -0,0 +1,92 @@
#!/usr/local/bin/gawk -f
#
# compare-ppls --
# Compare two LMs for significant differences in probabilities
# The probabilities calculated for the test set words are ranked
# pairwise, as appropriate for submitting the result to a sign test.
#
# usage: compare-ppls [mindelta=d] pplout1 pplout2
#
# where pplout1, pplout2 is the output of ngram -debug 2 -ppl for the two
# models. d is the minimum difference of logprobs for two probs to
# be considered different.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compare-ppls.gawk,v 1.6 2014-07-03 05:57:09 stolcke Exp $
#
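# A hypothetical end-to-end example (file names are placeholders):
#
#	ngram -lm lm1 -debug 2 -ppl test.txt > ppl.1
#	ngram -lm lm2 -debug 2 -ppl test.txt > ppl.2
#	compare-ppls mindelta=0.001 ppl.1 ppl.2
#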
function abs(x) {
return (x < 0) ? -x : x;
}
BEGIN {
sampleA_no = 0;
sampleB_no = 0;
mindelta = 0;
verbose = 0;
signif = 0;
diff_sum = 0;
diff_squared_sum = 0;
logINF = -100000;
}
FNR == 1 {
if (!readingA) {
readingA = 1;
} else {
readingA = 0;
}
}
readingA && $1 == "p(" {
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) prob = logINF;
else prob = $10;
sampleA[sampleA_no ++] = prob;
}
!readingA && $1 == "p(" {
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) prob = logINF;
else prob = $10;
if (sampleB_no > sampleA_no) {
printf "sample B contains more data than sample A" >> "/dev/stderr";
exit(1);
}
diff = sampleA[sampleB_no] - prob;
if (abs(diff) <= mindelta) {
equal ++;
} else {
diff_sum += diff;
diff_squared_sum += diff * diff;
if (diff < 0) {
if (verbose) {
print;
}
greater ++;
}
}
sampleB_no ++;
}
END {
if (sampleB_no < sampleA_no) {
printf "sample B contains less data than sample A" >> "/dev/stderr";
print sampleB_no, sampleA_no;
exit(1);
}
mean_diff = diff_sum / sampleA_no;
mean_sq_error = diff_squared_sum / sampleA_no - mean_diff * mean_diff;
stdev = sqrt(mean_sq_error);
printf "total %d, equal %d, different %d, greater %d\n", \
sampleB_no, equal, sampleB_no - equal, greater;
printf "meandiff %g, mse %g, stdev %g\n", \
mean_diff, mean_sq_error, stdev;
if (signif) {
printf "significance:\n";
less = sampleB_no - equal - greater;
system("cumbin " (less+greater) " " (less>greater ? less : greater));
}
}


@@ -0,0 +1,131 @@
#!/bin/sh
#
# compare-sclite --
# compare sclite word error sentence-by-sentence
#
# $Header: /home/srilm/CVS/srilm/utils/src/compare-sclite,v 1.26 2017/08/12 05:48:34 stolcke Exp $
#
# enforce proper sorting order
LC_COLLATE=C
export LC_COLLATE
if [ $# -lt 3 ]; then
echo "usage: $0 [-v] -h1 hyps1 -h2 hyps2 -r refs [-S id-subset] [-M|-multiwords] [sclite-options ...]" >&2
echo " or $0 hyps1 hyps2 refs" >&2
exit 2
elif [ $# -eq 3 ]; then
# old syntax
hypsA=${1}
hypsB=${2}
refs=${3}
else
# parse arguments
while [ $# -gt 0 ]; do
case "$1" in
-r) refs=$2; shift ;;
-h1) hypsA=$2; shift ;;
-h2) hypsB=$2; shift ;;
-S) options="$options -S $2"; shift ;;
*) options="$options $1" ;;
esac
shift
done
fi
tmpdir=${TMPDIR-/tmp}
pralignA=pralignA$$
pralignB=pralignB$$
subset="$tmpdir/subset$$"
trap '/bin/rm -f $tmpdir/$pralignA.pra $tmpdir/$pralignB.pra $subset.*' 0 1 2 13 15
set -e
#
# use the intersection of the two hyp sets and (if specified) the -S set
#
case "$hypsA" in
*.ctm) case "$hypsB" in
*.ctm) ${GAWK-gawk} '{ print $1 "_" $2 }' < "$hypsA" | sort -u > $subset.A
${GAWK-gawk} '{ print $1 "_" $2 }' < "$hypsB" | sort -u > $subset.B
;;
*) echo "both hyps must be in same format" >&2
exit 2
;;
esac
;;
*) case "$hypsB" in
*.ctm) echo "both hyps must be in same format" >&2
exit 2
;;
*) ${GAWK-gawk} '{ print $1 }' < "$hypsA" | sort -u > $subset.A
${GAWK-gawk} '{ print $1 }' < "$hypsB" | sort -u > $subset.B
;;
esac
;;
esac
comm -12 $subset.A $subset.B > $subset.AB
options="$options -S $subset.AB"
#
# generate alignments for the two hyp sets
#
compute-sclite -h "$hypsA" -r "$refs" $options -O $tmpdir -n $pralignA -o pralign
compute-sclite -h "$hypsB" -r "$refs" $options -O $tmpdir -n $pralignB -o pralign
#
# compute error totals by utterance and compare
#
${GAWK-gawk} '
BEGIN {
less = greater = equal = 0;
}
$1 == "id:" {
sentid = $2;
sub("^\\(", "", sentid);
sub("\\)$", "", sentid);
next;
}
$1 == "Scores:" {
corr = $6;
subs = $7;
dels = $8;
inss = $9;
words = corr + subs + dels;
errs = subs + dels + inss;
if (errors[sentid] == "") {
errors[sentid] = errs;
total_wordsA += words;
total_errsA += errs
total_sentsA ++;
} else {
if (errs > errors[sentid]) greater++;
else if (errs < errors[sentid]) less++;
else equal++;
total_wordsB += words;
total_errsB += errs;
total_sentsB ++;
}
next;
}
END {
werA = (total_wordsA > 0 ? total_errsA/total_wordsA * 100 : 0);
werB = (total_wordsB > 0 ? total_errsB/total_wordsB * 100 : 0);
printf "result 1: %d errors (%.2f%%), %d words, %d sentences\n", \
total_errsA, werA, total_wordsA, total_sentsA;
printf "result 2: %d errors (%.2f%%), %d words, %d sentences\n", \
total_errsB, werB, total_wordsB, total_sentsB;
printf "less %d, greater %d, equal %d, different %d (%+.2f%%)\n", \
less, greater, equal, less + greater, werB - werA;
if (less + greater > 0) {
printf "significance:\n"
system("cumbin " (less+greater) " " (less>greater ? less : greater));
}
}
' $tmpdir/$pralignA.pra $tmpdir/$pralignB.pra


@@ -0,0 +1,181 @@
#!/usr/local/bin/gawk -f
#
# compute-best-mix --
# Compute the best mixture weight (-lambda) for interpolating N
# LMs.
#
# usage: compute-best-mix [lambda="l1 l2 ..."] [precision=p] pplout1 pplout2 ...
#
# where pplout1, pplout2, ... is the output of ngram -debug 2 -ppl for the
# models. li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-mix.gawk,v 1.13 2017/12/22 01:34:49 stolcke Exp $
#
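# A hypothetical example for three component LMs, starting the search at
# weights (0.6, 0.2, 0.2) (file names are placeholders):
#
#	compute-best-mix lambda="0.6 0.2" precision=0.0001 ppl.a ppl.b ppl.c
#
# The lambda for the last model is filled in so that the weights sum to 1.
#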
BEGIN {
verbose = 0;
lambda = "0.5";
precision = 0.001;
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
unk = "<unk>";
}
function abs(x) {
return (x < 0) ? -x : x;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
function print_vector(x, n) {
result = "(" x[1];
for (k = 2; k <= n; k++) {
result = result " " x[k];
}
return result ")"
}
function print_vector_pairwise(x, n) {
total_lambda = x[1];
result = "(" 1;
for (k = 2; k <= n; k++) {
total_lambda += x[k];
result = result " " x[k]/total_lambda;
}
return result ")"
}
FNR == 1 {
nfiles ++;
}
$1 == "p(" {
word = $2;
# Canonicalize input to have at most one representative context word;
sub("[|] [^)]*)", "| X )");
$0 = $0;
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) {
prob = logINF;
} else {
prob = $10;
}
# If a count is given.
if ($11 ~ /^[*]/) {
count = substr($11,2);
} else {
count = 1;
}
sample_no = ++ nsamples[nfiles];
samples[nfiles " " sample_no] = prob;
counts[sample_no] = count;
if (sample_no in words) {
if (word != words[sample_no] && word != unk && words[sample_no] != unk) {
print "warning: word mismatch in file " FILENAME ", token " sample_no \
": " word " != " words[sample_no] > "/dev/stderr";
}
} else {
words[sample_no] = word;
}
}
END {
for (i = 2; i <= nfiles; i ++) {
if (nsamples[i] != nsamples[1]) {
printf "mismatch in number of samples (%d != %d)", \
nsamples[1], nsamples[i] >> "/dev/stderr";
exit(1);
}
}
last_prior = 0.0;
# initialize priors from lambdas
nlambdas = split(lambda, lambdas);
lambda_sum = 0.0;
for (i = 1; i <= nlambdas; i ++) {
priors[i] = lambdas[i];
lambda_sum += lambdas[i];
}
# fill in the missing lambdas
for (i = nlambdas + 1; i <= nfiles; i ++) {
priors[i] = (1 - lambda_sum)/(nfiles - nlambdas);
}
iter = 0;
have_converged = 0;
while (!have_converged) {
iter ++;
num_oovs = num_words = 0;
delete post_totals;
log_like = 0;
for (j = 1; j <= nsamples[1]; j ++) {
all_inf = 1;
for (i = 1; i <= nfiles; i ++) {
sample = samples[i " " j];
logpost[i] = log10(priors[i]) + sample;
all_inf = all_inf && (sample == logINF);
if (i == 1) {
logsum = logpost[i];
} else {
logsum = addlogs(logsum, logpost[i]);
}
}
# skip OOV words
if (all_inf) {
num_oovs += counts[j];
continue;
}
num_words += counts[j];
log_like += logsum * counts[j];
for (i = 1; i <= nfiles; i ++) {
post_totals[i] += exp10(logpost[i] - logsum) * counts[j];
}
}
printf "iteration %d, lambda = %s, ppl = %g\n", \
iter, print_vector(priors, nfiles), \
exp10(-log_like/num_words) >> "/dev/stderr";
fflush();
have_converged = 1;
for (i = 1; i <= nfiles; i ++) {
last_prior = priors[i];
priors[i] = post_totals[i]/num_words;
if (abs(last_prior - priors[i]) > precision) {
have_converged = 0;
}
}
}
printf "%d non-oov words, best lambda %s\n",
num_words, print_vector(priors, nfiles);
printf "pairwise cumulative lambda %s\n",
print_vector_pairwise(priors, nfiles);
}


@@ -0,0 +1,166 @@
#!/usr/local/bin/gawk -f
#
# compute-best-rover-mix --
# Compute the best mixture weight for combining multiple sausages
#
# usage: compute-best-rover-mix [lambda="l1 l2 ..."] [addone=N] [precision=p] nbest-rover-ref-posteriors-output
#
# where the input is the output of nbest-rover -write-ref-posteriors .
# li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-rover-mix.gawk,v 1.6 2016-12-10 07:06:41 stolcke Exp $
#
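# A hypothetical example that ties the weights of the first two systems into
# one bin and writes the optimized weights to a file (names are placeholders):
#
#	compute-best-rover-mix tying="1 1" write_weights=rover.wts ref-posteriors.out
#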
BEGIN {
verbose = 0;
lambda = "0.5";
addone = 0;
precision = 0.001;
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
zero_probs = 0;
}
function abs(x) {
return (x < 0) ? -x : x;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
function print_vector(x, n) {
result = x[1];
for (k = 2; k <= n; k++) {
result = result " " x[k];
}
return result;
}
{
nsystems = NF - 4;
if ($4 == 0) {
zero_probs ++;
} else {
sample_no ++;
for (i = 1; i <= nsystems; i++) {
samples[i " " sample_no] = $(i + 4);
}
}
}
END {
last_prior = 0.0;
# initialize priors from lambdas
nlambdas = split(lambda, lambdas);
lambda_sum = 0.0;
for (i = 1; i <= nlambdas; i ++) {
priors[i] = lambdas[i];
lambda_sum += lambdas[i];
}
# fill in the missing lambdas
for (i = nlambdas + 1; i <= nsystems; i ++) {
priors[i] = (1 - lambda_sum)/(nsystems - nlambdas);
}
# set up weight tying - assign input systems (weights) to tying bins
if (tying) {
ntying = split(tying, tying_bins);
for (i = 1; i <= ntying && i <= nsystems; i ++) {
this_bin = int(tying_bins[i]);
if (this_bin <= 0) {
print "invalid tying bin: " tying_bins[i];
exit(1);
}
binfor[i] = this_bin;
weights_in_bin[this_bin] += 1;
if (this_bin > nbins) nbins = this_bin;
}
} else {
i = 1;
nbins = 0;
}
# assign unique bins for weights not covered in tying argument string
for ( ; i <= nsystems; i ++) {
binfor[i] = ++nbins;
weights_in_bin[nbins] = 1;
}
iter = 0;
have_converged = 0;
while (!have_converged) {
iter ++;
num_words = 0;
delete post_totals;
log_like = 0;
for (j = 1; j <= sample_no; j ++) {
all_inf = 1;
for (i = 1; i <= nsystems; i ++) {
sample = log10(samples[i " " j]);
logpost[i] = log10(priors[i]) + sample;
all_inf = all_inf && (sample == logINF);
if (i == 1) {
logsum = logpost[i];
} else {
logsum = addlogs(logsum, logpost[i]);
}
}
# skip OOV words
if (all_inf) {
continue;
}
num_words ++;
log_like += logsum;
# total up the posteriors for each weight bin
for (i = 1; i <= nsystems; i ++) {
post_totals[binfor[i]] += exp10(logpost[i] - logsum);
}
}
printf "iteration %d, lambda = %s, ppl = %g\n", \
iter, print_vector(priors, nsystems), \
exp10(-log_like/num_words) >> "/dev/stderr";
fflush();
have_converged = 1;
for (i = 1; i <= nsystems; i ++) {
last_prior = priors[i];
priors[i] = (post_totals[binfor[i]]/weights_in_bin[binfor[i]] + addone)/(num_words + nsystems * addone);
if (abs(last_prior - priors[i]) > precision) {
have_converged = 0;
}
}
}
weights = print_vector(priors, nsystems);
printf "%d alignment positions, best lambda (%s)\n", num_words, weights;
if (write_weights) {
print weights > write_weights;
}
}


@@ -0,0 +1,159 @@
#!/usr/local/bin/gawk -f
#
# compute-best-sentence-mix --
# Compute the best sentence-level mixture weight for interpolating N
# LMs.
#
# usage: compute-best-sentence-mix [lambda="l1 l2 ..."] [addone=N] [precision=p] pplout1 pplout2 ...
#
# where pplout1, pplout2, ... is the output of ngram -debug 1 -ppl for the
# models. li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-sentence-mix.gawk,v 1.4 2016/06/01 20:20:38 stolcke Exp $
#
BEGIN {
verbose = 0;
lambda = "0.5";
addone = 0;
precision = 0.001;
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
}
function abs(x) {
return (x < 0) ? -x : x;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
function print_vector(x, n) {
result = "(" x[1];
for (k = 2; k <= n; k++) {
result = result " " x[k];
}
return result ")"
}
FNR == 1 {
nfiles ++;
num_words = 0;
num_sentences = 0;
}
# 1 sentences, 6 words, 0 OOVs
/^1 sentences, [0-9]* words, [0-9]* OOVs/ {
# exclude OOVs
num_words += $3 - $5;
expect_logprob = 1;
}
# 0 zeroprobs, logprob= -22.9257 ppl= 1884.06 ppl1= 6621.32
/^[0-9]* zeroprobs, logprob= / && expect_logprob {
# exclude zero prob words
num_words -= $1;
num_sentences += 1;
if ($4 ~ /-[Ii]nf|-1\.#INF/) {
prob = logINF;
} else {
prob = $4;
}
sample_no = ++ nsamples[nfiles];
samples[nfiles " " sample_no] = prob;
expect_logprob = 0;
}
END {
for (i = 2; i <= nfiles; i ++) {
if (nsamples[i] != nsamples[1]) {
printf "mismatch in number of samples (%d != %d)", \
nsamples[1], nsamples[i] >> "/dev/stderr";
exit(1);
}
}
last_prior = 0.0;
# initialize priors from lambdas
nlambdas = split(lambda, lambdas);
lambda_sum = 0.0;
for (i = 1; i <= nlambdas; i ++) {
priors[i] = lambdas[i];
lambda_sum += lambdas[i];
}
# fill in the missing lambdas
for (i = nlambdas + 1; i <= nfiles; i ++) {
priors[i] = (1 - lambda_sum)/(nfiles - nlambdas);
}
iter = 0;
have_converged = 0;
while (!have_converged) {
iter ++;
delete post_totals;
log_like = 0;
for (j = 1; j <= nsamples[1]; j ++) {
all_inf = 1;
for (i = 1; i <= nfiles; i ++) {
sample = samples[i " " j];
logpost[i] = log10(priors[i]) + sample;
all_inf = all_inf && (sample == logINF);
if (i == 1) {
logsum = logpost[i];
} else {
logsum = addlogs(logsum, logpost[i]);
}
}
# skip OOV words
if (all_inf) {
continue;
}
log_like += logsum;
for (i = 1; i <= nfiles; i ++) {
post_totals[i] += exp10(logpost[i] - logsum);
}
}
printf "iteration %d, lambda = %s, ppl = %g\n", \
iter, print_vector(priors, nfiles), \
exp10(-log_like/(num_words + num_sentences)) \
>> "/dev/stderr";
fflush();
have_converged = 1;
for (i = 1; i <= nfiles; i ++) {
last_prior = priors[i];
priors[i] = (post_totals[i] + addone)/(num_sentences + nfiles * addone);
if (abs(last_prior - priors[i]) > precision) {
have_converged = 0;
}
}
}
printf "%d sentences, %d non-oov words, best lambda %s\n",
num_sentences, num_words, print_vector(priors, nfiles);
}


@@ -0,0 +1,81 @@
#!/usr/local/bin/gawk -f
#
# compute-oov-rate --
# Compute OOV word rate from a vocabulary and a unigram count file
#
# usage: compute-oov-rate vocab countfile ...
#
# Assumes unigram counts do not have repeated words.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-oov-rate.gawk,v 1.10 2018/01/24 03:35:38 stolcke Exp $
#
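# A hypothetical example using unigram counts produced by ngram-count
# (file names are placeholders):
#
#	ngram-count -order 1 -text corpus.txt -write counts.1grams
#	compute-oov-rate vocab.txt counts.1grams
#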
BEGIN {
# high bit characters also detect multibyte characters
letter = "[[:alpha:]\x80-\xFF]";
if ("x" !~ letter) letter = "[A-Za-z\x80-\xFF]";
}
# Read vocab
#
ARGIND == 1 {
vocab[$1] = 1;
}
function is_fragment(word) {
return word ~ (letter "-$") || word ~ ("^-" letter);
}
#
# Read counts
#
ARGIND > 1 {
if ($1 == "<s>" || $1 == "</s>" || $1 == "-pau-") {
next;
}
total_count += $2;
total_types ++;
if (!vocab[$1]) {
oov_count += $2;
oov_types ++;
if (debug) {
print "OOV: " $1, $2 > "/dev/stderr";
}
if (!is_fragment($1)) {
if (write_oov_words) {
print > write_oov_words;
}
} else {
if (write_oov_frags) {
print > write_oov_frags;
}
}
}
if (!is_fragment($1)) {
total_nofrag_count += $2;
total_nofrag_types ++;
if (!vocab[$1]) {
oov_nofrag_count += $2;
oov_nofrag_types ++;
}
}
}
END {
printf "OOV tokens: %d / %d (%.2f%%) ", \
oov_count, total_count, total_count == 0 ? 0 : 100 * oov_count/total_count;
printf "excluding fragments: %d / %d (%.2f%%)\n", \
oov_nofrag_count, total_nofrag_count, \
total_nofrag_count == 0 ? 0 : 100 * oov_nofrag_count/total_nofrag_count;
printf "OOV types: %d / %d (%.2f%%) ", \
oov_types, total_types, total_types == 0 ? 0 : 100 * oov_types/total_types;
printf "excluding fragments: %d / %d (%.2f%%)\n", \
oov_nofrag_types, total_nofrag_types, \
total_nofrag_types == 0 ? 0 : 100 * oov_nofrag_types/total_nofrag_types;
}


@@ -0,0 +1,252 @@
#!/bin/sh
#
# compute-sclite --
# compute word error rate from a sentid hyp file and a sentid reference
# file, using the NIST 'sclite' program
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-sclite,v 1.49 2016/09/23 20:05:51 stolcke Exp $
#
# enforce proper sorting order
LC_COLLATE=C
export LC_COLLATE
reject="@reject@"
sclite=sclite
subsets=
remove_periods=
format_sentids=1
if [ $# -lt 2 ]; then
echo "usage: $0 [-v] -h hyps -r refs [-S id-subset] [-M|-multiwords] [-noperiods] [-g glm-file] [sclite-options ...]" >&2
echo " or $0 hyps refs" >&2
exit 2
elif [ $# -eq 2 ]; then
# old syntax
hyps=${1}
refs=${2}
else
# parse arguments
while [ $# -gt 0 ]; do
case "$1" in
-v) verbose=1 ;;
-r) refs=$2; shift ;;
-h) hyps="$hyps $2"
name=`basename $2`
shift ;;
-S) subsets="$subsets $2"; shift ;;
-M|-multiwords)
multiwords=1 ;;
-noperiods)
remove_periods=1 ;;
-H) remove_hesitations=1 ;;
-keep_bracketed)
keep_bracketed=1 ;;
-R) reject="<>" ;;
-g) glmfile=$2; shift ;;
-s) case_sensitive=1 ;;
-overlap-limit)
options="$options $1 $2"
sclite=asclite
shift;;
-raw-sentids)
format_sentids=0
;;
*) options="$options $1" ;;
esac
shift
done
fi
if [ -n "$case_sensitive" ]; then
filter_options="-s";
options="$options -s";
fi
tmpdir=${TMPDIR-/tmp}
sentids="$tmpdir/ce.sentids$$"
speakers="$tmpdir/ce.speakers$$"
sortedrefs="$tmpdir/ce.refs$$"
sortedhyps="$tmpdir/ce.hyps$$"
ignorehyps="$tmpdir/ce.ign$$"
if [ -z "$verbose" ]; then
trap '/bin/rm -f $sentids $speakers $sortedrefs $sortedhyps $ignorehyps' \
0 1 2 13 15
fi
set -e
multijoin () {
if [ $# -eq 1 ]; then
cat $1
else
join $1 $2 | { shift; shift; multijoin - "$@"; }
fi
}
#
# extract and sort sentids from hyps
# (for CTM hyps these are just waveform/channel labels)
#
case "$hyps" in
*.ctm)
cat $hyps | \
${GAWK-gawk} '!/^;;/ && $7 != "non-lex" && $7 != "fp" { print $1 "_" $2 }' ;;
*) cat $hyps | ${GAWK-gawk} '{ print $1 }' ;;
esac | \
sort | \
multijoin - $subsets > $sentids
#
# extract list of "speakers" (waveform/channel labels)
#
case "$hyps" in
*.ctm)
cat $sentids | uniq | tr '[A-Z]' '[a-z]' | sort > $speakers
;;
*) sed 's,\([-_][ABab12]\)[-_].*,\1,' $sentids | uniq | \
tr '[A-Z]' '[a-z]' | sort > $speakers
;;
esac
#
# extract and sort refs for these sentids
#
case "$refs" in
*.stm) # NIST scoring:
# filter out speakers not occurring in hyp file
${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' $refs | \
sort -k 1,1 -k 5,5n | \
join - $speakers | \
${GAWK-gawk} '{ $1 = ""; if ($7 ~ /^<.*>$/) $7 = "<>"; print }' | \
if [ -n "$glmfile" ]; then
${GAWK-gawk} '{ gsub("-","_",$1); gsub("-","_",$3); print }' | \
csrfilt.sh $filter_options -i stm -t ref -dh $glmfile
else
cat
fi > $sortedrefs
;;
*.stm.filt) # NIST scoring with pre-filtered references
# filter out speakers not occurring in hyp file
${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' $refs | \
sort -k 1,1 -k 5,5n | \
join - $speakers | \
${GAWK-gawk} '{ $1 = ""; if ($7 ~ /^<.*>$/) $7 = "<>"; print }' | \
if [ -n "$glmfile" ]; then
${GAWK-gawk} '{ gsub("-","_",$1); gsub("-","_",$3); print }'
else
cat
fi > $sortedrefs
;;
*) sort "$refs" | join - $sentids | \
${GAWK-gawk} '{ if (multiwords) for (i = 2; i <= NF; i++) \
gsub("_", " ", $i); print }'\
multiwords=$multiwords | \
sed -e 's,\[[^]]*\],,g' | \
sentid-to-sclite format_sentids=$format_sentids | \
if [ -n "$glmfile" ]; then
csrfilt.sh $filter_options -i trn -t hyp -dh $glmfile
else
cat
fi > $sortedrefs
# find segments to ignore
${GAWK-gawk} 'NF == 2 && tolower($2) == "ignore_time_segment_in_scoring" \
{ print $1 }' < $refs | \
sort > $ignorehyps
;;
esac
if [ ! -s $sortedrefs ]; then
echo "Filtered references are empty" >&2
exit 1
fi
#
# sort and condition hyps
#
case "$refs" in
*.stm|*.stm.filt) # NIST scoring
# sclite will handle ignored segments
case "$hyps" in
*.ctm)
cat $hyps | ${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' | \
sort -b -k 1,1 -k 2,2 -k 3,3 -k 4,4n | join - $speakers | \
${GAWK-gawk} '{ $1 = ""; print }' ;;
*) sort -k 1,1 $hyps | join - $sentids | sentid-to-ctm ;;
esac | \
${GAWK-gawk} '{ # handle new-style CTM format (convert it to old format)
if (NF >= 7) {
if ($7 != "lex") next;
else $7 = $8 = "";
}
if (remove_periods) gsub("[.]", "", $5);
print;
}' remove_periods=$remove_periods | \
if [ -n "$glmfile" ]; then
${GAWK-gawk} '{ gsub("-","_",$1); print }' | \
csrfilt.sh $filter_options -i ctm -t hyp -dh $glmfile | \
if [ -n "$remove_hesitations" ]; then
grep -vi '%HESITATION'
else
cat
fi
else
cat
fi > $sortedhyps
;;
*) # we have to remove ignored segments ourselves
sort -k 1,1 $hyps | join - $sentids | join -v 1 - $ignorehyps | \
${GAWK-gawk} '{ if (multiwords) for (i = 2; i <= NF; i++) gsub("_", " ", $i);
if (remove_periods) for (i = 2; i <= NF; i++) gsub("[.]", "", $i);
print }'\
remove_periods=$remove_periods multiwords=$multiwords | \
sed -e 's,\[[^]]*\],,g' \
-e "s,$reject,,g" \
-e 's,-pau-,,g' | \
if [ -z "$keep_bracketed" ]; then
sed -e 's,<[^>]*>,,g'
else
cat
fi |\
sentid-to-sclite format_sentids=$format_sentids |\
if [ -n "$glmfile" ]; then
csrfilt.sh $filter_options -i trn -t hyp -dh $glmfile | \
if [ -n "$remove_hesitations" ]; then
sed -e 's/\%HESITATION//g' -e 's/\%hesitation//g'
else
cat
fi
else
cat
fi > $sortedhyps
;;
esac
if [ ! -s $sortedhyps ]; then
echo "Filtered hypotheses are empty" >&2
exit 1
fi
[ "$verbose" ] && set -x
case $sclite in
sclite) options="-n $name $options" ;;
esac
case "$refs" in
*.stm|*.stm.filt) # NIST scoring
$sclite -f 0 -O . \
-h $sortedhyps ctm $name -r $sortedrefs stm \
-D $options
;;
*) $sclite -f 0 -O . \
-h $sortedhyps trn $name -r $sortedrefs trn \
-i swb $options
;;
esac


@@ -0,0 +1,153 @@
#!/bin/sh
#
# compute-sclite-nbest --
# Compute errors for nbest hypotheses using sclite
# for use with nbest-optimize -errors option
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-sclite-nbest,v 1.5 2016/09/23 20:05:51 stolcke Exp $
#
usage () {
echo "$0 nbest-files output-dir -r refs [-filter script] [sclite-options]"
}
if [ $# -lt 2 ]; then
usage;
exit 2
fi
filter=cat
nbest_files=$1
output_dir=$2
shift; shift
while [ $# -gt 0 ]
do
case "$1" in
-r) refs=$2
shift; shift
;;
-filter) filter="$2"
shift; shift
;;
*) sclite_options="$sclite_options $1"
shift
;;
esac
done
if [ -z "$refs" ]; then
usage
exit 2
fi
TMPDIR=${TMPDIR-/tmp}
sortedrefs=$TMPDIR/sortedrefs.$$
nbestrefs=$TMPDIR/nbestrefs.$$
nbesthyps=$TMPDIR/nbesthyps.$$
scliteout=$TMPDIR/scliteout.$$
trap "/bin/rm -f $sortedrefs $nbestrefs $nbesthyps $scliteout; exit 1" 1 2 13 15
set -e
sort -k 1,1 $refs > $sortedrefs
> $nbestrefs
> $nbesthyps
#
# Prepare hyp and reference files
#
cat $nbest_files | \
sed 's,.*/\(.*\).gz$,\1 &,' | \
sort -k 1,1 | \
join - $sortedrefs | \
while read sentid nbestlist refwords
do
if [ -z "$refwords" ]; then
echo "warning: $sentid has no reference" >&2
continue
fi
echo $sentid >&2
gunzip -cf $nbestlist | \
nbest-words | \
$filter | \
${GAWK-gawk} \
-v nbestrefs=$nbestrefs -v nbesthyps=$nbesthyps \
-v outdir=$output_dir \
-v sentid=$sentid -v refwords="$refwords" '{
if (refwords == "ignore_time_segment_in_scoring") {
# this utterance is to be ignored --
# we generate dummy error information directly
# nbest-optimize(1) error count format is: wcr wer nsub ndel nins nerr nw
print 0, 0, 0, 0, 0, 0, 0 | "gzip > " outdir "/" sentid ".gz";
} else {
gsub("<[^ ]*>", "");
gsub("-pau-", "");
hypid = sprintf("%s#%05d", sentid, NR);
print hypid, refwords >> nbestrefs;
print hypid, $0 >> nbesthyps;
}
}'
done
if [ -s $nbestrefs ]; then
#
# Run the scoring
#
(set -x; compute-sclite \
-raw-sentids \
$sclite_options \
-O $TMPDIR -l 1000 \
-r $nbestrefs \
-h $nbesthyps \
-o pralign )
#
# Extract error counts from sclite pra output
#
${GAWK-gawk} -v outdir=$output_dir '
$1 == "id:" {
sentid = $2;
sub("^\\(", "", sentid);
# strip the hyp number
sub("#[0-9]*)$", "", sentid);
# sclite lowercases sentids
# Heuristically restore channel letters to uppercase
sub("_a_", "_A_", sentid);
sub("_b_", "_B_", sentid);
sub("-a-", "-A-", sentid);
sub("-b-", "-B-", sentid);
if (sentid != last_sentid) {
if (outfile) close(outfile);
outfile = "gzip > " outdir "/" sentid ".gz"
last_sentid = sentid;
}
next;
}
$1 == "Scores:" {
corr = $6;
subs = $7;
dels = $8;
inss = $9;
words = corr + subs + dels;
errs = subs + dels + inss;
wer = words > 0 ? errs/words : 0;
# nbest-optimize(1) error count format is: wcr wer nsub ndel nins nerr nw
print words-dels-subs, wer, subs, dels, inss, errs, words | outfile;
}
END {
if (outfile) close(outfile);
}' $nbesthyps.pra
fi
/bin/rm -f $sortedrefs $nbestrefs $nbesthyps $nbesthyps.pra


@@ -0,0 +1,152 @@
#!/usr/local/bin/gawk -f
#
# concat-sausages --
# concatenate a list of sausages into a single word confusion network
#
# $Header: /home/srilm/CVS/srilm/utils/src/concat-sausages.gawk,v 1.1 2019/02/09 07:34:35 stolcke Exp $
#
# input format:
#
# name Speech012_apple-iphone-6s-agc_00001330_00010030
# numaligns 32
# posterior 1
# align 0 <s> 1
# info 0 <s> 1.33 0.06 0 0 : :
# align 1 OK 1
# info 1 OK 1.39 0.5 0 0 : :
# align 2 *DELETE* 1 I 3.110077054250103e-33 we 3.193624897980025e-52 i 7.615703946522299e-53
# info 2 I 1.83 0.06 0 0 : :
# info 2 we 1.85 0.06 0 0 : :
# info 2 i 1.83 0.06 0 0 : :
#
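# A hypothetical invocation (variables must be passed with -v so they are not
# mistaken for input files; names are placeholders):
#
#	concat-sausages -v output_name=combined part1.sausage.gz part2.sausage.gz \
#		> combined.sausage
#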
BEGIN {
name = "";
numaligns = 0;
posterior = 0;
if (posterior_factor == "") {
posterior_factor = 1;
}
sent_start = "<s>";
sent_end = "</s>";
epsilon = 1e-05;
}
function abs(x) {
return x < 0 ? -x : x;
}
function process_sausage(file, remove_start, remove_end) {
if (file ~ /.*\.gz$|.*\.Z/) {
input = "exec gunzip -c " file;
} else {
input = "exec cat " file;
}
while ((status = (input | getline)) > 0) {
if ($1 == "name") {
if (output_name != "") {
name = output_name;
} else if (name == "") {
name = $2;
} else {
name = name "+" $2
}
} else if ($1 == "posterior") {
if (posterior != 0 && abs($2 - posterior) > epsilon) {
print file ": incompatible posterior: " $2 > "/dev/stderr"
exit(1);
} else {
posterior = $2;
# if (posterior_factor != 1) {
# posterior *= posterior_factor;
# }
}
} else if ($1 == "numaligns") {
# offset for renumbered alignments
start_alignment = numaligns;
} else if ($1 == "align") {
$2 = $2 + start_alignment;
if (posterior_factor != 1 && $3 != sent_start && $3 != sent_end) {
for (i = 4; i <= NF; i += 2) {
$i *= posterior_factor;
}
}
#
# remove alignment positions that are just for
# start/end sentence tags, if so desired
#
if (NF == 4 && $3 == sent_start && remove_start) {
start_alignment --;
;
} else if (NF == 4 && $3 == sent_end && remove_end) {
start_alignment --;
;
} else {
alignments[$2] = $0;
if ($2 + 1 > numaligns) {
numaligns = $2 + 1;
}
}
} else if ($1 == "info") {
$2 = $2 + start_alignment;
if (!($2 in info)) {
info[$2] = $0;
} else {
info[$2] = info[$2] "\n" $0;
}
} else if ($1 == "time") {
; # ignore
} else {
print file ": unknown keyword: " $1 > "/dev/stderr";
exit(1);
}
}
if (status < 0) {
print "error opening " file >> "/dev/stderr";
}
close(input);
}
function output_sausage() {
print "name", name;
print "numaligns", numaligns;
print "posterior", posterior;
for (i = 0; i < numaligns; i ++) {
if (i in alignments) {
print alignments[i];
if (i in info) {
print info[i];
}
}
}
}
BEGIN {
if (ARGC < 2) {
print "usage: " ARGV[0] " SAUSAGE1 SAUSAGE2 ..." \
>> "/dev/stderr";
exit(2);
}
for (arg = 1; arg < ARGC; arg ++) {
process_sausage(ARGV[arg], arg > 1, arg < ARGC-1);
}
output_sausage();
}


@@ -0,0 +1,13 @@
#!/usr/local/bin/gawk -f
#
# context-ngrams --
# Extract counts corresponding to ngram contexts
#
# $Header: /home/srilm/CVS/srilm/utils/src/context-ngrams.gawk,v 1.1 2008/09/30 03:54:05 stolcke Exp $
#
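# For example, the (hypothetical) count line "the quick fox 7" becomes
# "the quick  7": the last word before the count is removed, leaving the
# context and its count.
#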
NF > 2 {
$(NF-1) = "";
print $0;
}


@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# continuous-ngram-count --
# Generate ngram counts ignoring line breaks
#
# usage: continuous-ngram-count order=ORDER textfile | ngram-count -read -
#
# $Header: /home/srilm/CVS/srilm/utils/src/continuous-ngram-count.gawk,v 1.1 1998/08/24 00:52:30 stolcke Exp $
#
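# For example, with order=3 the two input lines "a b" and "c" yield the same
# counts as the single line "a b c", namely
#	a 1 / b 1 / a b 1 / c 1 / b c 1 / a b c 1
# since line breaks are ignored.
#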
BEGIN {
order = 3;
head = 0; # next position in ring buffer
}
function process_word(w) {
buffer[head] = w;
ngram = "";
for (j = 0; j < order; j ++) {
w1 = buffer[(head + order - j) % order];
if (w1 == "") {
break;
}
ngram = w1 " " ngram;
print ngram 1;
}
head = (head + 1) % order;
}
{
for (i = 1; i <= NF; i ++) {
process_word($i);
}
}


@@ -0,0 +1,80 @@
#!/usr/bin/env perl
# This tool calculates probability over the tail of a binomial
# distribution. The calculation is done directly, without using any
# approximations.
#
# This program is in the public domain. It was written
# by Brett Kessler and David Gelbart.
#
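# Example (numbers are approximate): "cumbin 100 60" reports the one-tailed
# probability P(k >= 60 | n=100, p=0.5), which is roughly 0.028, and twice
# that value for the two-tailed result.
#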
use warnings;
use strict;
use POSIX;
if (@ARGV != 2 && @ARGV != 3) {
die "Usage: $0 n k [p]\n";
}
my $n = $ARGV[0];
my $k = $ARGV[1];
my $p = $ARGV[2];
if (!(defined $p)) {
$p = 0.5;
}
if (($n - $k) > $k) {
die "Did you choose the right value of k?\n";
}
my $P = tailBinomial($n, $k, $p);
print "One-tailed: P(k >= ${k} | n=${n}, p=${p}) = ${P}\n";
$P = 2 * $P;
print "Two-tailed: 2*P(k >= ${k} | n=${n}, p=${p}) = ${P}\n";
# Calculate the sum over the tail of the binomial probability distribution.
sub tailBinomial {
my($N, $k, $p) = @_;
my $sum = 0;
for (my $i = $k; $i <= $N; $i++) {
$sum += exp(logBinomial($N, $i, $p));
}
$sum;
}
# We use logarithms during calculation to avoid overflow during the
# calculation of factorials and underflow during the calculation of
# powers of probabilities. This function calculates the log of
# binomial probability for given N, k, p.
sub logBinomial {
my($N, $k, $p) = @_;
my $q = 1 - $p;
# These safety checks were inspired by the code at
# http://faculty.vassar.edu/lowry/binomialX.html
die "Error: N not integer" if ($N != floor($N));
die "Error: k not integer" if ($k != floor($k));
die "Error: k > N" if ($k > $N);
die "Error: p > 1" if ($p > 1);
die "Error: N < 1" if ($N < 1);
logBinomCoeff($N, $k) + $k * log($p) + ($N - $k) * log($q);
}
# Calculate the log of the binomial coefficient for given N and k.
sub logBinomCoeff {
my($N, $k) = @_;
logFactorial($N) - logFactorial($k) - logFactorial($N - $k);
}
# Calculate the log of the factorial of the argument.
sub logFactorial {
my($N) = @_;
my $prod = 0;
for (my $i = 2; $i <= $N; $i++) {
$prod += log($i);
}
$prod;
}

View File

@@ -0,0 +1,89 @@
#!/usr/local/bin/gawk -f
#
# de-vq-lm --
# Expand parameters in a quantized ARPA backoff LM
#
# usage: de-vq-lm bins=CW lm-file > expanded-lm-file
#
# where CW defines the quantization bins.
#
# Copyright (c) 2012 Andreas Stolcke, Microsoft Corp. All Rights Reserved.
#
# $Header: /home/srilm/CVS/srilm/utils/src/de-vq-lm.gawk,v 1.2 2019/09/09 23:13:15 stolcke Exp $
#
BEGIN {
bins = "/dev/null";
}
# read the cw file
#
#VQSize 256
#Codeword Mean Count
# 0 -12.7330028909195 10454
# 1 -12.3314038288506 1494
# etc.
#
NR == 1 {
saveline = $0;
getline < bins;
if ($1 != "VQSize") {
print "file " bins " is not a VQ file" > "/dev/stderr";
exit(1);
}
vqsize = $2;
getline < bins;
if ($1 != "Codeword") {
print "file " bins " is not a VQ file" > "/dev/stderr";
exit(1);
}
while ((getline < bins) > 0) {
vqbin[$1] = $2;
}
close(bins);
$0 = saveline;
}
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
print; next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
print; next;
}
/^\\/ {
print; next;
}
#
# replace VQ index with value in ngram parameter lines
#
currorder {
if (!($1 in vqbin)) {
print "line: " NR ": VQ bin #" $1 "is undefined" > "/dev/stderr";
exit(1);
}
$1 = vqbin[$1];
# backoff weight, if any
if (NF == currorder + 2) {
if (!($NF in vqbin)) {
print "line: " NR ": VQ bin #" $NF "is undefined" > "/dev/stderr";
exit(1);
}
$NF = vqbin[$NF];
}
print; next;
}
# pass through anything else
{ print }

View File

@@ -0,0 +1,79 @@
#!/bin/sh
#
# empty-sentence-lm --
# modify language model to allow the empty sentence.
# This adds a "<s> </s>" bigram to the model and scales the
# probabilities of other bigrams starting with <s>.
#	Backoff weights are recomputed.
#
# usage: empty-sentence-lm -prob P -lm oldlm -write-lm newlm
#
# $Header: /home/srilm/CVS/srilm/utils/src/empty-sentence-lm,v 1.5 2013/03/09 07:13:01 stolcke Exp $
#
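# Worked example, assuming the default -prob 0.1: the new "<s> </s>"
# bigram gets log10(0.1) = -1, existing bigrams starting with <s> have
# log10(0.9) ~= -0.046 added to their log probs, and the final
# "ngram -renorm" pass recomputes the backoff weights exactly.
#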
oldlm=-
newlm=-
prob=0.1
vocab=/dev/null
norm_option=-renorm
while [ $# -gt 0 ]; do
case "$1" in
-prob) prob="$2" ; shift ;;
-lm) oldlm="$2" ; shift ;;
-write-lm) newlm="$2" ; shift ;;
-nonorm) norm_option= ; shift ;;
*) options="$options $1" ;;
esac
shift
done
gzip -dcf $oldlm | ${GAWK-gawk} '
function log10(x) {
return log(x)/2.30258509299404568402;
}
/^ngram 2=/ {
num = substr($2, 3);
print "ngram 2=" num + 1;
next;
}
#
# add empty-sentence bigram
#
/^\\2-grams:/ {
print;
print log10(prob), "<s> </s>";
in_ngrams = 2;
next;
}
#
# ensure that <s> has backoff weight and
# approximately adjust it (correct adjustment done by ngram -renorm)
#
in_ngrams == 1 && $2 == "<s>" {
$3 += log10(1-prob);
}
#
# scale bigram probs starting with <s>
#
in_ngrams == 2 && $2 == "<s>" {
$1 += log10(1-prob);
}
/^\\1-grams:/ {
in_ngrams = 1;
}
/^\\3-grams:/ {
in_ngrams = 3;
}
{
print;
}' prob=$prob | \
ngram -lm - $norm_option -write-lm "$newlm" $options

View File

@@ -0,0 +1,17 @@
#!/usr/local/bin/gawk -f
#
# extract-skip-probs --
# Extract the skip probabilities from a Skip-Ngram model
#
# $Header: /home/srilm/CVS/srilm/utils/src/extract-skip-probs.gawk,v 1.1 1996/05/20 21:22:09 stolcke Exp $
#
NF == 0 {
next;
}
/\\end\\/ {
end_seen = 1;
next;
}
end_seen {
printf "%s %f\n", $1, $2;
}

View File

@@ -0,0 +1,44 @@
#!/usr/local/bin/gawk -f
#
# filter-event-counts --
# Remove from a count file all ngrams that don't correspond to an "event"
# for the LM, such that
#
# ngram -order N -lm LM -ppl TEXT
# and
# ngram-count -order N -text TEXT -write - | \
# filter-event-counts order=N | \
# ngram -order N -lm LM -counts -
#
# yield the same result.
#
# $Header: /home/srilm/CVS/srilm/utils/src/filter-event-counts.gawk,v 1.2 2009/09/25 00:06:50 stolcke Exp $
#
BEGIN {
order = 3;
escape = "";
sent_start = "<s>";
}
# pass escaped lines through
escape != "" && substr($0, 1, length(escape)) == escape {
print;
next;
}
# Start-of-sentence ngrams are always included (except for <s> unigram)
$1 == sent_start {
if (NF == 2) {
next;
} else {
print;
next;
}
}
# ngrams of highest order
NF == order + 1 {
print;
}

View File

@@ -0,0 +1,89 @@
#!/usr/local/bin/gawk -f
#
# find-reference-posteriors --
#	tabulate the sausage posteriors of reference words
#
# usage: find-reference-posteriors posteriors_file=NBEST_POSTERIORS SAUSAGE
#
# $Header: /home/srilm/CVS/srilm/utils/src/find-reference-posteriors.gawk,v 1.4 2010/08/20 00:17:18 stolcke Exp $
#
BEGIN {
sentid = "UNKNOWN";
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
NR == 1 {
if (posteriors_file) {
hypno = 0;
num_sources = 0;
while ((("gzip -dcf " posteriors_file) | getline pline) > 0) {
if (split(pline, a) == 3) {
hyp_source[hypno] = a[1];
if (a[1] > num_sources) {
num_sources = a[1];
}
hyp_posterior[hypno] = a[3];
hypno ++;
}
}
print "read " hypno " posteriors from " num_sources " sources" \
>> "/dev/stderr";
}
}
# input format:
# align 1 hello 0.988212 below 0.00481234 low 0.00331215 ...
# reference 1 hello
# hyps 1 hello 0 1 2 3 4 5 6 7 8 9 10 11 16 17 18 19
$1 == "align" {
position = $2;
delete word_posteriors;
for (i = 3; i <= NF; i +=2 ) {
word_posteriors[$i] = $(i + 1);
}
}
$1 == "reference" && $2 == position {
refword = $3;
}
$1 == "hyps" && $2 == position && $3 == refword {
for (i = 1; i <= num_sources; i ++) {
posterior_sum[i] = logINF;
}
for (i = 4; i <= NF; i ++) {
posterior_sum[hyp_source[$i]] = \
addlogs(posterior_sum[hyp_source[$i]], hyp_posterior[$i]);
}
printf "%s %d %s %g", sentid, position, refword, \
word_posteriors[refword];
for (i = 1; i <= num_sources; i ++) {
printf " %g", exp10(posterior_sum[i]);
}
printf "\n";
}

View File

@@ -0,0 +1,153 @@
#!/usr/local/bin/gawk -f
#
# Post-process CTM files output by lattice-tool -output-ctm to
# use global conversation-relative time marks and channel ids.
# (This requires that the waveform names conform to our standard
# formats, the same as in sentid-to-ctm.)
#
# $Header: /home/srilm/CVS/srilm/utils/src/fix-ctm.gawk,v 1.10 2019/02/09 07:30:11 stolcke Exp $
#
BEGIN {
# time to add to word start times (should be about half FE window size)
phase_shift = 0.01;
tag_pat = "^<.*>$";
htk_tag_pat = "^null|^!sent_start|^!sent_end";
noise_pat = "^\\[.*\\]$";
fragment_pat = "-$";
pause = "-pau-";
channel_letters = 0;
# hesitations (best deleted for NIST scoring;
# should be kept in sync with GLM filter file)
hesitation["uh"] = 1;
hesitation["um"] = 1;
hesitation["eh"] = 1;
hesitation["mm"] = 1;
hesitation["hm"] = 1;
hesitation["ah"] = 1;
hesitation["huh"] = 1;
hesitation["ha"] = 1;
hesitation["er"] = 1;
hesitation["oof"] = 1;
hesitation["hee"] = 1;
hesitation["ach"] = 1;
hesitation["eee"] = 1;
hesitation["ew"] = 1;
parse_sentids = 1;
orig_times = 0; # DON'T preserve original times
sort_cmd = "sort -b -k 1,1 -k 2,2 -k 3,3n";
}
{
sentid = $1;
start_time = $3;
duration = $4;
word = $5;
confidence = $6;
# HTK stuff: strip quotes
sub("\"", "", sentid);
sub("\"", "", sentid);
# archive aliasing info
sub("=.*\\[.*\\]$", "", sentid);
# standard input file suffixes.
sub("\\.plp$", "", sentid);
sub("\\.wav$", "", sentid);
sub("\\.sph$", "", sentid);
if (sentid == last_sentid && start_time == "?") {
start_time = last_end_time;
duration = 0;
}
# exclude sentence start/end tags
if (word ~ tag_pat) next;
if (tolower(word) ~ htk_tag_pat) next;
if (sentid == last_sentid) {
if (start_time <= last_start_time) {
new_start_time = last_start_time + .01;
print "warning: " sentid ": word \"" word "\" start time " start_time " " \
(start_time < last_start_time ? "is less than" : "equals") \
" previous word -- adjusting to " new_start_time > "/dev/stderr";
start_time = new_start_time;
}
}
if (!parse_sentids) {
conv = sentid;
channel = $2;
start_offset = 0;
} else if (match(sentid, "_[0-9]_[-0-9][0-9]*_[0-9][0-9]*$")) {
# waveforms with [012] channel id, timemarks 1/1000s
# NOTE: this form is used by the segmenter
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = sentid_parts[2] / 1000;
end_offset = sentid_parts[3] / 1000;
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*$")) {
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = sentid_parts[2] / 100;
end_offset = sentid_parts[3] / 100;
} else {
print "cannot parse sentid " sentid >> "/dev/stderr";
conv = sentid;
channel = 1;
start_offset = 0;
end_offset = 10000;
}
if (orig_times) {
start_offset = 0;
}
if (channel_letters && channel ~ /^[0-9]/) {
channel = sprintf("%c", 64+channel);
}
speaker_id = conv "_" channel;
ncomps = split(word, word_comps, "_");
for (j = 1; j <= ncomps; j ++) {
this_word = word_comps[j];
if (this_word == pause) {
next;
} else if (this_word in hesitation) {
word_type = "fp";
} else if (this_word ~ fragment_pat) {
word_type = "frag";
} else if (this_word ~ noise_pat) {
word_type = "non-lex";
} else {
word_type = "lex";
}
printf "%s %s %.2f %.2f %s %g %s %s\n", \
conv, channel, \
start_offset + start_time + phase_shift + \
(j - 1) * duration/ncomps,\
duration/ncomps, \
this_word, \
confidence, \
word_type, \
(word_type == "non-lex" ? \
"null" : speaker_id) \
| sort_cmd;
}
last_start_time = start_time;
last_end_time = start_time + duration;
last_sentid = sentid;
}

View File

@@ -0,0 +1,158 @@
#!/usr/local/bin/gawk -f
#
# fsm-to-pfsg --
# convert AT&T FSM acceptor to Decipher PFSG format
#
# usage: fsm-to-pfsg [pfsg_name=NAME] [transducer=1] [scale=S] file.fsm > file.pfsg
# pfsg_name=NAME sets PFSG name to NAME
# transducer=1 indicates input is a transducer
# scale=S sets transition weight scaling factor to S
# (default -1)
#
# $Header: /home/srilm/CVS/srilm/utils/src/fsm-to-pfsg.gawk,v 1.10 2015-07-03 03:45:38 stolcke Exp $
#
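# Input is AT&T FSM text format: arc lines "from to label [cost]" (with an
# extra output label before the cost for transducers) and final-state lines
# "state [cost]".  Hypothetical acceptor input:
#	0 1 hello 0.5
#	1 2 world 1.0
#	2
# Each PFSG node created below corresponds to a (word, FSM-state) pair, so
# that word outputs move from the FSM arcs onto PFSG nodes.
#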
BEGIN {
pfsg_name = "from_fsm";
transducer = 0; # input is transducer
if ("TMPDIR" in ENVIRON) {
tmpdir = ENVIRON["TMPDIR"];
} else {
tmpdir = "/tmp"
}
if ("pid" in PROCINFO) {
pid = PROCINFO["pid"];
} else {
getline pid < "/dev/pid";
}
tmpfile = tmpdir "/fsm.tmp" pid;
# hack to remove tmpfile when killed
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
print "" | trap_cmd;
num_newnodes = 0;
initial_node = -1;
empty_output = "NULL";
epsilon = "<eps>"; # FSM epsilon symbol
map_epsilon = ""; # map epsilon to this symbol
scale = -1; # scaling of transition weights
}
# transition description
NF >= 3 {
from_node = $1;
to_node = $2;
if (map_epsilon && $3 == epsilon) $3 = map_epsilon;
if (transducer) {
if (map_epsilon && $4 == epsilon) $4 = map_epsilon;
# collapse input and output into a single symbol
$3 = $3 ":" $4;
$4 = "";
}
output = $3;
if (initial_node < 0) {
initial_node = from_node;
}
# create new node names for pairs of output,old-node
if (!(output " " to_node in newnode_table)) {
output_table[num_newnodes] = output;
newnode_table[output " " to_node] = num_newnodes ++;
# create list of incoming outputs for each state
insymbols[to_node] = insymbols[to_node] " " output;
}
# save for re-reading
print $0 > tmpfile;
next;
}
# final state description
NF >= 1 {
node = $1;
if (initial_node < 0) {
initial_node = node;
}
# save for re-reading
print $0 > tmpfile;
next;
}
END {
close(tmpfile);
# create initial and final nodes
if (!(empty_output " " initial_node in newnode_table)) {
output_table[num_newnodes] = empty_output;
newnode_table[empty_output " " initial_node] = num_newnodes ++;
insymbols[initial_node] = insymbols[initial_node] " " empty_output;
}
initial_newnode = newnode_table[empty_output " " initial_node];
output_table[num_newnodes] = empty_output;
final_newnode = num_newnodes++;
# print PFSG header info
print "name " pfsg_name;
printf "nodes %d", num_newnodes;
for (i = 0; i < num_newnodes; i ++) {
printf " %s", output_table[i];
}
printf "\n";
printf "initial %d\n", initial_newnode;
printf "final %d\n", final_newnode;
# re-read FSM description, counting total number of new
# transitions
num_transitions = 0;
while (getline < tmpfile) {
from_node = $1;
# duplicate transition for all insymbols of from_node
num_transitions += split(insymbols[from_node], a);
}
close(tmpfile);
printf "transitions %d\n", num_transitions;
	# re-read FSM description, outputting new transitions
while (getline < tmpfile) {
if (NF >= 3) {
from_node = $1;
to_node = $2;
output = $3;
cost = (NF == 3 ? 0 : $4);
# duplicate transition for all insymbols of from_node
n = split(insymbols[from_node], a);
for (i = 1; i <= n; i ++) {
printf "%d %d %d\n", \
newnode_table[a[i] " " from_node], \
newnode_table[output " " to_node], \
scale * cost;
}
} else {
from_node = $1;
cost = (NF == 1 ? 0 : $2);
# add final transition for all insymbols of from_node
n = split(insymbols[from_node], a);
for (i = 1; i <= n; i ++) {
printf "%d %d %d\n", \
newnode_table[a[i] " " from_node], \
final_newnode, \
scale * cost;
}
}
}
}

View File

@@ -0,0 +1,39 @@
#!/usr/local/bin/gawk -f
#
# get-gt-counts --
# generate the counts-of-counts required for Good-Turing discounting
# assumes the ngrams in the input contain no repetitions
#
# usage: get-gt-counts max=<number> out=<name> file ...
#
# $Header: /home/srilm/CVS/srilm/utils/src/get-gt-counts.gawk,v 1.5 2016-01-07 17:19:21 stolcke Exp $
#
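# Hypothetical example: if the input holds 5 distinct bigrams, 3 with
# count 1 and 2 with count 2, then <name>.gt2counts will contain the
# lines "1 3" and "2 2" (plus "0 0" and zero entries up to max) and a
# final "total 5" line.
#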
BEGIN {
max = 10
maxorder = 9;
}
{
total[NF - 1] ++;
}
NF > 1 && $NF <= max {
counts[(NF - 1), $NF] ++;
}
END {
for (order = 1; order <= maxorder; order++) {
if (total[order] > 0) {
if (out) {
outfile = out ".gt" order "counts";
} else {
outfile = "/dev/stdout";
}
for (i = 0; i <= max; i ++) {
c = counts[order, i];
print i, c ? c : "0" > outfile;
}
print "total", total[order] > outfile;
if (out) close(outfile);
}
}
}

View File

@@ -0,0 +1,38 @@
#!/usr/local/bin/gawk -f
#
# get-unigram-probs --
# extract unigram probabilities from backoff LM file
#
# usage: get-unigram-probs bo-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/get-unigram-probs.gawk,v 1.3 2018/06/28 07:45:08 stolcke Exp $
#
BEGIN {
linear = 0;
currorder = 0;
logzero = -99;
}
/^\\[0-9]-grams:/ {
currorder = substr($0,2,1);
next;
}
/^\\/ {
currorder = 0;
next;
}
currorder == 1 && NF > 0 {
if (NF < 2) {
print "line " NR ": missing word" > "/dev/stderr";
} else if (linear) {
print $2, $1 == logzero ? 0 : 10^$1;
} else {
print $2, $1 == logzero ? "-infinity" : $1;
}
next;
}

View File

@@ -0,0 +1,79 @@
#!/usr/local/bin/gawk -f
#
# hits-from-log --
#	Computes n-gram hit ratios from the output of
#
# ngram -debug 2 -ppl
#
# This is useful if one wants to analyse predictability of certain
# words/contexts.
#
# Copyright (c) 1995, SRI International. All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/hits-from-log.gawk,v 1.3 1995/10/28 03:59:31 stolcke Exp $
#
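# The input is expected to contain, per word, a line of the form
# (format approximate)
#	p( streets | the ... ) 	= [3gram] 0.0231 [ -1.6363 ]
# and this script simply tallies how often each [Ngram] (or [N+Tgram])
# tag appears.
#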
BEGIN {
M_LN10 = 2.30258509299404568402; # from <math.h>
}
/6gram/ {
words ++;
hits[6] ++;
next;
}
/5gram/ {
words ++;
hits[5] ++;
next;
}
/4gram/ {
words ++;
hits[4] ++;
next;
}
/3gram/ {
words ++;
hits[3] ++;
next;
}
/3\+Tgram/ {
words ++;
thits[3] ++;
next;
}
/2gram/ {
words ++;
hits[2] ++;
next;
}
/2\+Tgram/ {
words ++;
thits[2] ++;
next;
}
/1gram/ {
words ++;
hits[1] ++;
next;
}
/1\+Tgram/ {
words ++;
thits[1] ++;
next;
}
{
next;
}
END {
printf "%d words, hit rates:\n", words;
for (i = 1; i <= 6; i++) {
if (hits[i]) {
printf "%dgrams: %d (%.1f%%) ", i, hits[i], \
(hits[i]/words * 100);
}
if (thits[i]) {
printf "%d+Tgrams: %d (%.1f%%) ", i, thits[i], \
(thits[i]/words * 100);
}
}
printf "\n";
}

View File

@@ -0,0 +1,50 @@
#!/usr/local/bin/gawk -f
#
# htklat-vocab --
# extract vocabulary used in an HTK lattice
#
# usage: htklat-vocab HTK-LATTICE ... > VOCAB
#
# $Header: /home/srilm/CVS/srilm/utils/src/htklat-vocab.gawk,v 1.3 2004/02/27 21:42:28 stolcke Exp $
#
BEGIN {
null = "!NULL";
quotes = 0;
}
{
for (i = 1; i <= NF; i ++) {
# skip comments
if ($i ~ /^#/) next;
# Note: this doesn't handle quoted spaces
# (as SRILM generally doesn't)
if ($i ~ /^W=/ || $i ~ /^WORD=/) {
word = substr($i, index($i, "=") + 1);
if (quotes) {
# HTK quoting conventions
if (word ~ /^['"]/) {
word = substr(word, 2, length(word)-2);
}
if (word ~ /\\/) {
gsub(/\\\\/, "@QuOtE@", word);
gsub(/\\/, "", word);
gsub(/@QuOtE@/, "\\", word);
}
}
if (word != null) {
is_word[word] = 1;
}
}
}
}
END {
for (word in is_word) {
print word;
}
}

View File

@@ -0,0 +1,14 @@
#!/usr/local/bin/gawk -f
#
# Test for classname heuristic used in add-pauses-to-pfsg.gawk
#
# $Header: /home/srilm/CVS/srilm/utils/src/isclassname.gawk,v 1.1 2007/10/19 04:16:25 stolcke Exp $
#
function is_classname(w) {
return w ~ /^\*.*\*$/ || !(w ~ /[[:lower:]]/ || w ~ /[^\x00-\x7F]/);
}
{
print $1 " is " (!is_classname($1) ? "not " : "") "a class name";
}

View File

@@ -0,0 +1,31 @@
#!/usr/local/bin/gawk -f
#
# log10-to-bytelog --
# convert log-base-10 scores to bytelog
#
# $Header: /home/srilm/CVS/srilm/utils/src/log10-to-bytelog.gawk,v 1.1 1997/04/22 20:20:41 stolcke Exp $
#
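# A bytelog is the natural log scaled by 10000.5/1024, so the conversion
# applied below is
#	bytelog = log10-score * ln(10) * 10000.5 / 1024 ~= 22.49 * log10-score
# rounded to the nearest integer unless round is set to 0.
#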
BEGIN {
logscale = 2.30258509299404568402 * 10000.5 / 1024.0;
scale = 1;
round = 0.5;
}
function rint(x) {
if (x < 0) {
return int(x - round);
} else {
return int(x + round);
}
}
{
for (i = 1; i <= NF; i ++) {
if ($i ~ /^[-+.0-9][.0-9]*$/) {
if (round) {
$i = scale * rint($i * logscale);
} else {
$i = scale * $i * logscale;
}
}
}
print;
}

View File

@@ -0,0 +1,30 @@
#!/usr/local/bin/gawk -f
#
# make-abs-discount --
# computes the absolute (constant) discount values from Good-Turing
# counts-of-counts statistics. (Only the n1 and n2 statistics are used.)
#
# usage: make-abs-discount COUNTFILE
#
# where COUNTFILE was created with get-gt-counts.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-abs-discount.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
#
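# The discount computed below is D = n1 / (n1 + 2*n2); e.g. with
# hypothetical counts-of-counts n1=100 and n2=40 this gives
# D = 100/180 ~= 0.56.
#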
$1 == 1 {
gt1count = $2;
}
$1 == 2 {
gt2count = $2;
}
END {
if (gt1count == 0) {
print "n1 count is zero" >> "/dev/stderr";
exit 1;
}
if (gt2count == 0) {
print "n2 count is zero" >> "/dev/stderr";
exit 1;
}
print gt1count/(gt1count + 2 * gt2count);
}

View File

@@ -0,0 +1,112 @@
#!/bin/sh
#
# make-batch-counts --
# generate n-gram counts in batches
#
# A list of data files is partitioned into batches, results from each of
# which are deposited in a separate ngram-count file.
#
# usage: make-batch-count file-list [batch-size [filter \
# usage: make-batch-counts file-list [batch-size [filter \
#
# file-list is a file containing a list of data files
# (lines starting with # are ignored)
# batch-size is the number of input files per batch
# filter is preprocessor filter to condition the data
# countdir is the directory where count files are deposited
# options are arguments passed on to ngram-count
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-batch-counts,v 1.8 2013/03/19 18:37:52 stolcke Exp $
#
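# Example invocation (hypothetical paths):
#	make-batch-counts corpus.files 20 /bin/cat counts -order 4
# reads the file list corpus.files, counts 20 data files per batch, and
# leaves counts/corpus-<n>.ngrams[.gz] files plus a counts/corpus.stats log.
#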
if [ $# -lt 1 ]; then
echo "usage: $0 file-list [batch-size [filter [countdir [options]]]]" >&2
exit 2
fi
filelist=$1
batchsize=${2-10}
filter=${3-/bin/cat}
countdir=${4-./counts}
case $# in
1) shift;;
2) shift; shift;;
3) shift; shift; shift;;
4) shift; shift; shift; shift;;
esac
options="$@"
what=`basename $filelist .files`
statsfile=$countdir/$what.stats
infiles=$countdir/$what.files
set -e
if [ ! -d $countdir ]; then
mkdir $countdir
fi
trap 'rm -f $newfile $test_in $test_out; exit 1' 1 2 15
# determine if ngram-count can generate compressed files
test_in=$countdir/testin
test_out=$countdir/testout.gz
echo x > $test_in
ngram-count -text $test_in -write $test_out
if gzip -l $test_out >/dev/null 2>&1; then
gz=.gz
else
gz=
fi
rm $test_in $test_out
> $statsfile
#
# format filelist into one batch per line, preceded by line number
#
${GAWK-gawk} -v batchsize=$batchsize \
'BEGIN {
batchno = 1;
}
/^#/ || NF == 0 {
next;
}
{
files = files " " $0;
numfiles += 1;
if (numfiles >= batchsize) {
print batchno, files;
files = "";
numfiles = 0;
batchno += 1;
}
}
END {
if (numfiles > 0) {
print batchno, files;
}
}' $filelist | \
while read fileno datafiles; do
newfile=$countdir/$what-$fileno.ngrams$gz
	# feed $datafiles to the filter via xargs to avoid command-line length limits
cat <<EOF >&2
counting in $newfile sources $datafiles
EOF
echo $datafiles | \
xargs $filter | \
ngram-count -text - \
-tag $newfile \
-sort \
-write-order 0 \
-write $newfile \
$options \
2>> $statsfile
done

View File

@@ -0,0 +1,276 @@
#!/bin/sh
#
# make-big-lm --
# Create a large ngram language model
#
# This script automates various techniques for building large ngram models.
# It is useful for building LMs that would exceed available real memory
# if built in one pass by ngram-count.
# The techniques employed are
# - Assume counts are already produced
# (typically using make-batch-counts/merge-batch-counts)
# - Compute Good Turing discounts without loading all counts
# into memory.
#	- ngram-count loads only those counts exceeding cutoff values.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-big-lm,v 1.25 2015-05-27 08:10:52 stolcke Exp $
#
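# Example invocation (hypothetical paths), after make-batch-counts and
# merge-batch-counts have produced a single merged count file:
#	make-big-lm -name big4 -read counts/merged.ngrams.gz -order 4 \
#		-kndiscount -unk -lm big4.lm.gz
# Options not recognized here (such as -unk and -lm) are passed through
# to ngram-count.
#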
name=biglm
order=3
gt1min=1
gt2min=1
gt3min=2
gt4min=2
gt5min=2
gt6min=2
gt7min=2
gt8min=2
gt9min=2
gt1max=7
gt2max=7
gt3max=7
gt4max=7
gt5max=7
gt6max=7
gt7max=7
gt8max=7
gt9max=7
kndiscount1=0
kndiscount2=0
kndiscount3=0
kndiscount4=0
kndiscount5=0
kndiscount6=0
kndiscount7=0
kndiscount8=0
kndiscount9=0
ukndiscount1=0
ukndiscount2=0
ukndiscount3=0
ukndiscount4=0
ukndiscount5=0
ukndiscount6=0
ukndiscount7=0
ukndiscount8=0
ukndiscount9=0
using_kn=
max_per_file=10000000
ngram_filter=cat
subset_filter=cat
counts=
test_data=
trust_totals=0
metatag=__meta__ # lowercase so it works with ngram-count -tolower
# avoid locale problems with gawk script computing discounting parameters
LC_NUMERIC=C; export LC_NUMERIC
while [ $# -gt 0 ]; do
case "$1" in
-name) name=$2; shift ;;
-order) order=$2 ; shift ;;
-gt1min) gt1min=$2; options="$options $1 $2" ; shift ;;
-gt2min) gt2min=$2; options="$options $1 $2" ; shift ;;
-gt3min) gt3min=$2; options="$options $1 $2" ; shift ;;
-gt4min) gt4min=$2; options="$options $1 $2" ; shift ;;
-gt5min) gt5min=$2; options="$options $1 $2" ; shift ;;
-gt6min) gt6min=$2; options="$options $1 $2" ; shift ;;
-gt7min) gt7min=$2; options="$options $1 $2" ; shift ;;
-gt8min) gt8min=$2; options="$options $1 $2" ; shift ;;
-gt9min) gt9min=$2; options="$options $1 $2" ; shift ;;
-gt1max) gt1max=$2; using_gt=1; shift ;;
-gt2max) gt2max=$2; using_gt=1; shift ;;
-gt3max) gt3max=$2; using_gt=1; shift ;;
-gt4max) gt4max=$2; using_gt=1; shift ;;
-gt5max) gt5max=$2; using_gt=1; shift ;;
-gt6max) gt6max=$2; using_gt=1; shift ;;
-gt7max) gt7max=$2; using_gt=1; shift ;;
-gt8max) gt8max=$2; using_gt=1; shift ;;
-gt9max) gt9max=$2; using_gt=1; shift ;;
	-kndiscount1)	kndiscount1=1; using_kn=1 ;;
	-kndiscount2)	kndiscount2=1; using_kn=1 ;;
	-kndiscount3)	kndiscount3=1; using_kn=1 ;;
	-kndiscount4)	kndiscount4=1; using_kn=1 ;;
	-kndiscount5)	kndiscount5=1; using_kn=1 ;;
	-kndiscount6)	kndiscount6=1; using_kn=1 ;;
	-kndiscount7)	kndiscount7=1; using_kn=1 ;;
	-kndiscount8)	kndiscount8=1; using_kn=1 ;;
	-kndiscount9)	kndiscount9=1; using_kn=1 ;;
-kndiscount) kndiscount1=1; kndiscount2=1; kndiscount3=1;
kndiscount4=1; kndiscount5=1; kndiscount6=1;
kndiscount7=1; kndiscount8=1; kndiscount9=1;
using_kn=1 ;;
	-ukndiscount1)	kndiscount1=1; ukndiscount1=1; using_kn=1 ;;
	-ukndiscount2)	kndiscount2=1; ukndiscount2=1; using_kn=1 ;;
	-ukndiscount3)	kndiscount3=1; ukndiscount3=1; using_kn=1 ;;
	-ukndiscount4)	kndiscount4=1; ukndiscount4=1; using_kn=1 ;;
	-ukndiscount5)	kndiscount5=1; ukndiscount5=1; using_kn=1 ;;
	-ukndiscount6)	kndiscount6=1; ukndiscount6=1; using_kn=1 ;;
	-ukndiscount7)	kndiscount7=1; ukndiscount7=1; using_kn=1 ;;
	-ukndiscount8)	kndiscount8=1; ukndiscount8=1; using_kn=1 ;;
	-ukndiscount9)	kndiscount9=1; ukndiscount9=1; using_kn=1 ;;
-ukndiscount) kndiscount1=1; kndiscount2=1; kndiscount3=1;
kndiscount4=1; kndiscount5=1; kndiscount6=1;
kndiscount7=1; kndiscount8=1; kndiscount9=1;
ukndiscount1=1; ukndiscount2=1; ukndiscount3=1;
ukndiscount4=1; ukndiscount5=1; ukndiscount6=1;
ukndiscount7=1; ukndiscount8=1; ukndiscount9=1;
using_kn=1 ;;
-wbdiscount) using_wb=1 ;;
-wbdiscount*|-cdiscount*|-ndiscount*|-addsmooth*)
echo "$0: must use one of GT, KN, UKN, or WB discounting for all orders" >&2
exit 2 ;;
-read) if [ "$2" = "" -o "$2" = - -o "$2" = "/dev/stdin" ]; then
echo "$0: cannot read from stdin" >&2
exit 2
fi
counts="$counts $2" ; shift ;;
-trust-totals) trust_totals=1 ;;
-max-per-file) max_per_file=$2 ; shift ;;
-ngram-filter) ngram_filter="$2" ; shift ;;
-text) test_data="$2"; shift ;;
*) options="$options $1" ;;
esac
shift
done
if [ -z "$counts" ]; then
echo "No counts specified" >&2
echo "usage: $0 -read COUNTS [-name PATH] [-text TESTSET] [-ngram-filter FILTER] [-max-per-file N] [ngram-count-options ...]" >&2
exit 2
fi
if [ -n "$using_gt" -a -n "$using_kn" -o \
-n "$using_gt" -a -n "$using_wb" -o \
-n "$using_kn" -a -n "$using_wb" ]
then
echo "$0: cannot mix GT, KN, and WB discounting" >&2
exit 2
fi
if [ $trust_totals -eq 0 ]; then
options="$options -meta-tag $metatag"
else
if [ "$using_kn" ]; then
echo "$0: -trust-totals incompatible with KN discounting; ignoring it" >&2
options="$options -meta-tag $metatag"
else
options="$options -trust-totals"
fi
fi
set -e
#
# if KN smoothing is used, compute the modified lower-order counts
#
if [ "$using_kn" ]; then
kncounts=$name.kncounts.gz
if [ -f $kncounts ]; then
echo "using existing $kncounts" >&2
elif [ $order -eq 1 ]; then
# create a dummy empty file
gzip -f < /dev/null > $kncounts
else
mkdir -p $name.kndir
gzip -dcf $counts | \
eval "$ngram_filter" | \
(set -x; make-kn-counts \
no_max_order=1 max_per_file=$max_per_file \
order=$order \
kndiscount1=$kndiscount1 kndiscount2=$kndiscount2 \
kndiscount3=$kndiscount3 kndiscount4=$kndiscount4 \
kndiscount5=$kndiscount5 kndiscount6=$kndiscount6 \
kndiscount7=$kndiscount7 kndiscount8=$kndiscount8 \
kndiscount9=$kndiscount9 \
output=$name.kndir/kncounts)
(set -x; merge-batch-counts $name.kndir)
# this will fail if more than one count file is left in kndir,
# i.e., if merging didn't finish successfully
mv `find $name.kndir -name \*.ngrams.gz -print ` $kncounts
fi
options="$options -kn-counts-modified"
fi
#
# compute counts-of-counts
#
if [ "$using_wb" ]; then
:
elif [ -f $name.gt${order}counts ]; then
echo "using existing gtcounts" >&2
else
if [ "$using_kn" ]; then
# concatenate KN modified counts with highest-order original counts
# Note: even though $kncounts ends in .gz it might be a plain file
	# if platform doesn't support gzip pipes, so use gzip -dcf .
gzip -dcf $kncounts | ${GAWK-gawk} 'NF < 1+'$order
gzip -dcf $counts | eval "$ngram_filter" | ${GAWK-gawk} 'NF == 1+'$order
else
gzip -dcf $counts | eval "$ngram_filter"
fi | (set -x; get-gt-counts out=$name max=20 maxorder=$order)
fi
#
# compute discount factors
#
if [ "$using_wb" ]; then
# apply WB discount to all ngram orders
gtflags=-wbdiscount
else
gtflags=
fi
for n in 1 2 3 4 5 6 7 8 9
do
if [ $n -le $order -a -f $name.gt${n}counts ]; then
if (set +e; eval [ \"\$ukndiscount${n}\" -eq 1 ]); then
gtflags="$gtflags -kn${n} $name.kn${n}"
eval make-kn-discounts modified=0 \
min=\$gt${n}min $name.gt${n}counts > $name.kn${n}
elif (set +e; eval [ \"\$kndiscount${n}\" -eq 1 ]); then
gtflags="$gtflags -kn${n} $name.kn${n}"
eval make-kn-discounts \
min=\$gt${n}min $name.gt${n}counts > $name.kn${n}
else
gtflags="$gtflags -gt${n} $name.gt${n}"
eval make-gt-discounts \
min=\$gt${n}min max=\$gt${n}max \
$name.gt${n}counts > $name.gt${n}
fi
fi
done
# if test data is specified compute context ngrams
if [ -n "$test_data" -a $order -gt 1 ]; then
order1=`expr $order - 1`
(set -x; \
ngram-count -order $order1 -text "$test_data" -sort -write $name.contexts)
# ... and filter the ngrams to contain only the required contexts
subset_filter="subset-context-ngrams contexts=$name.contexts"
fi
#
# filter counts and build lm
#
if [ "$using_kn" ]; then
# concatenate KN modified counts with highest-order original counts
# Note: even though $kncounts ends in .gz it might be a plain file
	# if platform doesn't support gzip pipes, so use gzip -dcf .
gzip -dcf $kncounts | ${GAWK-gawk} 'NF < 1+'$order
gzip -dcf $counts | eval "$ngram_filter" | ${GAWK-gawk} 'NF == 1+'$order
else
gzip -dcf $counts | eval "$ngram_filter"
fi | \
eval "$subset_filter" | \
(set -x; \
ngram-count -read - -read-with-mincounts -order $order \
$gtflags \
$options)
rm -f $name.contexts

View File

@@ -0,0 +1,89 @@
#!/usr/local/bin/gawk -f
#
# make-diacritic-map --
# Generate a map from ascii to accented word forms
# for use with disambig(1)
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-diacritic-map.gawk,v 1.3 1998/02/04 20:28:02 stolcke Exp $
#
/^#/ {
next;
}
function asciify(word) {
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "A", word);
gsub("<22>", "AE", word);
gsub("<22>", "C", word);
gsub("<22>", "E", word);
gsub("<22>", "E", word);
gsub("<22>", "E", word);
gsub("<22>", "E", word);
gsub("<22>", "I", word);
gsub("<22>", "I", word);
gsub("<22>", "I", word);
gsub("<22>", "I", word);
gsub("<22>", "N", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "O", word);
gsub("<22>", "U", word);
gsub("<22>", "U", word);
gsub("<22>", "U", word);
gsub("<22>", "U", word);
gsub("<22>", "Y", word);
gsub("<22>", "ss", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "a", word);
gsub("<22>", "c", word);
gsub("<22>", "e", word);
gsub("<22>", "e", word);
gsub("<22>", "e", word);
gsub("<22>", "e", word);
gsub("<22>", "i", word);
gsub("<22>", "i", word);
gsub("<22>", "i", word);
gsub("<22>", "i", word);
gsub("<22>", "n", word);
gsub("<22>", "o", word);
gsub("<22>", "o", word);
gsub("<22>", "o", word);
gsub("<22>", "o", word);
gsub("<22>", "o", word);
gsub("<22>", "u", word);
gsub("<22>", "u", word);
gsub("<22>", "u", word);
gsub("<22>", "u", word);
gsub("<22>", "y", word);
return word;
}
{
word = $1;
asciiword = asciify(word);
if (asciiword in map) {
map[asciiword] = map[asciiword] " " word;
} else {
map[asciiword] = word;
}
}
END {
print "<s>\t<s>"
print "</s>\t</s>"
fflush()
for (w in map) {
print w "\t" map[w] | "sort";
}
}

View File

@@ -0,0 +1,124 @@
#!/usr/local/bin/gawk -f
#
# make-google-ngrams --
# split ngram count file into an indexed directory structure
# compatible with the Google ngrams distributed by LDC
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-google-ngrams.gawk,v 1.6 2010/08/20 00:17:18 stolcke Exp $
#
# usage: zcat counts.gz | make-google-ngrams [dir=DIR] [per_file=N] [gzip=0] [yahoo=1]
#
# INPUT DATA is assumed to be a sorted ngram count file
#
#
# OUTPUT DATA FORMAT
#
# a) top-level directory
# doc: documentation
# data: data
# (the top-level structure is required by LDC)
# b) data directory
# one sub-directory per n-gram order: 1gms, 2gms, 3gms, 4gms, 5gms
# (separating the orders makes it easier for people to use smaller orders)
# c) contents of sub-directory 1gms
# - file 'vocab.gz' contains the vocabulary sorted by word in unix
# sort-order. Each word is on its own line:
# WORD <tab> COUNT
# - file 'vocab_cs.gz' contains the same data as 'vocab.gz' but
# sorted by count.
# (need to be 8+3 file names)
# d) contents of sub-directories 2gms, 3gms, 4gms, 5gms:
# - files 'Ngm-KKKK.gz' where N is the order of the n-grams
# and KKKK is the zero-padded number of the file. Each file contains
# 10 million n-gram entries. N-grams are unix-sorted. Each
# n-gram occupies one line:
# WORD1 <space> WORD2 <space> ... WORDN <tab> COUNT
# - file 'Ngm.idx' where N is the order of the n-grams, with one line for
# each n-gram file:
# FILENAME <tab> FIRST_NGRAM_IN_FILE
BEGIN {
dir = "data";
per_file = 10000000;
gzip = 1;
}
NR == 1 {
if (gzip) {
gzip_cmd = "gzip";
gzip_suff = ".gz";
} else {
gzip_cmd = "cat";
gzip_suff = "";
}
}
# determine ngram length
{
if (yahoo) {
order = NF - 5;
if (order > 0) {
$NF = $(NF-1) = $(NF-2) = $(NF-3) = "";
}
} else {
order = NF - 1;
}
}
#
# unigrams
#
order == 1 {
if (!have_dir[1]) {
system("mkdir -p " dir "/1gms");
have_dir[1] = 1;
output_file[1] = gzip_cmd " > " dir "/1gms/vocab" gzip_suff;
}
print | output_file[1];
next;
}
order > 1 {
if (output_ngram_count[order] == 0) {
output_ngram_count[order] = 1;
system("mkdir -p " dir "/" order "gms");
if (output_file[order]) close(output_file[order]);
output_name = sprintf("%dgm-%04d%s", order, output_file_count[order] ++, gzip_suff);
output_file[order] = gzip_cmd " > " dir "/" order "gms/" output_name;
ngram = $1;
for (i = 2; i <= order; i ++) {
ngram = ngram " " $i;
}
print output_name "\t" ngram > (dir "/" order "gms/" order "gm.idx");
}
print | output_file[order];
output_ngram_count[order] += 1;
output_ngram_count[order] %= (per_file + 1);
next;
}
order < 1 {
print FILENAME ": " FNR ": insufficient number of fields" > "/dev/stderr";
print $0 > "/dev/stderr";
exit(1);
}
#
# sort unigrams by count
#
END {
close(output_file[1]);
if (have_dir[1]) {
system("gzip -dcf " dir "/1gms/vocab" gzip_suff " | sort -k 2,2rn | " gzip_cmd " > " dir "/1gms/vocab_cs" gzip_suff);
}
}

View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/gawk -f
#
# make-gt-discounts --
# generate Good-Turing discounting parameters from a count-of-count
# file
#
# The purpose of this script is to do the GT computation off-line,
# without ngram-count having to read all counts into memory.
# The output is compatible with the ngram-count -gt<n> options.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-gt-discounts.gawk,v 1.3 2004/11/02 02:00:35 stolcke Exp $
#
# usage: make-gt-discounts min=<mincount> max=<maxcount> countfile
#
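# The discount coefficients written out follow the Good-Turing formula
# used below: with n_i the count-of-counts and K = maxcount,
#	d_i = ( (i+1)*n_{i+1} / (i*n_i) - A ) / ( 1 - A ),
#	A   = (K+1)*n_{K+1} / n_1
# for 1 <= i <= K.
#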
BEGIN {
min=1;
max=7;
}
/^#/ {
# skip comments
next;
}
{
countOfCounts[$1] = $2;
}
END {
# Code below is essentially identical to GoodTuring::estimate()
# (Discount.cc).
minCount = min;
maxCount = max;
if (!countOfCounts[1]) {
printf "warning: no singleton counts\n" >> "/dev/stderr";
maxCount = 0;
}
while (maxCount > 0 && countOfCounts[maxCount + 1] == 0) {
printf "warning: count of count %d is zero -- lowering maxcount\n", \
maxCount + 1 >> "/dev/stderr";
maxCount --;
}
if (maxCount <= 0) {
printf "GT discounting disabled\n" >> "/dev/stderr";
} else {
commonTerm = (maxCount + 1) * \
countOfCounts[maxCount + 1] / \
countOfCounts[1];
for (i = 1; i <= maxCount; i++) {
if (countOfCounts[i] == 0) {
printf "warning: count of count %d is zero\n", \
i >> "/dev/stderr";
coeff = 1.0;
} else {
coeff0 = (i + 1) * countOfCounts[i+1] / \
(i * countOfCounts[i]);
coeff = (coeff0 - commonTerm) / (1.0 - commonTerm);
if (coeff <= 0 || coeff0 > 1.0) {
printf "warning: discount coeff %d is out of range: %g\n", \
i, coeff >> "/dev/stderr";
coeff = 1.0;
}
}
discountCoeffs[i] = coeff;
}
}
printf "mincount %d\n", minCount;
printf "maxcount %d\n", maxCount;
for (i = 1; i <= maxCount; i++) {
printf "discount %d %g\n", i, discountCoeffs[i];
}
}

View File

@@ -0,0 +1,100 @@
#!/usr/local/bin/gawk -f
#
# make-hiddens-lm --
# Create a hidden-sentence-boundary ngram LM from a standard one
#
# This script edits an ARPA backoff model file as follows:
#
# 1 - ngrams involving <s> and </s> are duplicated using the
# hidden segment boundary token <#s>.
# 2 - ngrams starting with <s> are eliminated.
# 3 - the backoff weight of <s> is set to 1.
# this together with the previous change sets all probabilities conditioned
#	on <s> to the respective marginal probabilities without <s>.
# 4 - ngrams ending in </s> get probability 1.
# this avoids an end-of-sentence penalty in rescoring.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-hiddens-lm.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
sent_start = "<s>";
sent_end = "</s>";
hiddens = "<#s>";
remove_old_ngrams = 0;
}
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
print;
next;
}
/^.[0-9]-grams:/ {
currorder=substr($0,2,1);
}
/^\\/ {
print; next;
}
#
currorder && currorder < highorder {
if (NF < currorder + 2) {
print $0 "\t0";
} else {
print;
}
next;
}
$0 ~ sent_start || $0 ~ sent_end {
oldline = $0;
# modify sentence initial/final ngrams
if ($2 == sent_end && currorder == 1) {
sos_uniprob = $1;
if (no_s_end) {
# set </s> prob to 1
$1 = 0;
}
if (!remove_old_ngrams) {
print;
}
next;
} else if ($2 == sent_start && currorder == 1) {
if (no_s_start) {
# set <s> backoff weight to 1
$3 = 0;
}
if (!remove_old_ngrams) {
print;
}
# use unigram prob from </s>
if (sos_uniprob == "") {
print "warning: could not find " sent_end " unigram" \
>> "/dev/stderr";
} else {
oldline = sos_uniprob "\t" $2 "\t" $3;
}
} else if ($2 == sent_start) {
# suppress other ngrams starting with <s>
if (!no_s_start && !remove_old_ngrams) {
print;
}
} else if ($(currorder + 1) == sent_end) {
if (no_s_end) {
# set </s> prob to 1
$1 = 0;
}
if (!remove_old_ngrams) {
print;
}
}
# replace <s> and </s> with <#s> and output result
gsub(sent_start, hiddens, oldline);
gsub(sent_end, hiddens, oldline);
print oldline;
next;
}
{ print }

View File

@@ -0,0 +1,82 @@
#!/usr/local/bin/gawk -f
#
# make-kn-counts --
# Modify N-gram counts for KN smoothing
#
# This duplicates the action of ModKneserNey::prepareCounts().
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-kn-counts.gawk,v 1.5 2007/06/16 04:51:18 stolcke Exp $
#
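# In effect, every lower-order count subject to KN discounting is replaced
# by a Kneser-Ney "type" count, i.e. the number of distinct words preceding
# the ngram: for each higher-order line "w1 w2 ... wn c" the script emits
# "w2 ... wn 1" and lets ngram-count sum these 1s into the modified count.
#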
BEGIN {
order = 3;
no_max_order = 0;
sent_start = "<s>";
ngram_count = "ngram-count";
output = "-";
max_per_file = 0;
file_no = 0;
ngram_no = 0;
}
function set_output () {
close(output_cmd);
ngram_cmd = ngram_count " -order " order " -read - -sort -write ";
if (max_per_file > 0) {
output_cmd = ngram_cmd output "-" ++file_no ".ngrams.gz";
} else {
output_cmd = ngram_cmd output;
}
}
NR == 1 {
kndiscount[1] = kndiscount1;
kndiscount[2] = kndiscount2;
kndiscount[3] = kndiscount3;
kndiscount[4] = kndiscount4;
kndiscount[5] = kndiscount5;
kndiscount[6] = kndiscount6;
kndiscount[7] = kndiscount7;
kndiscount[8] = kndiscount8;
kndiscount[9] = kndiscount9;
if (output == "-") {
max_per_file = 0;
}
set_output();
}
# discard ngrams not used in LM building
NF - 1 > order {
next;
}
# keep ngrams not subject to KN discounting, or those starting with <s>
# if desired, highest-order ngrams are discarded to save space
NF - 1 == order || !kndiscount[NF - 1] || $1 == sent_start {
if (!no_max_order || NF - 1 < order) {
if (max_per_file > 0 && ++ngram_no % max_per_file == 0) {
ngram_no = 0;
set_output();
}
print | output_cmd;
}
}
# modify lower-order ngrams subject to KN discounting
NF - 2 < order && kndiscount[NF - 2] && $2 != sent_start {
$1 = $NF = "";
if (max_per_file > 0 && ++ngram_no % max_per_file == 0) {
ngram_no = 0;
set_output();
}
# we let ngram-count add up the new counts for us
print $0, 1 | output_cmd;
}

View File

@@ -0,0 +1,119 @@
#!/usr/local/bin/gawk -f
#
# make-kn-discounts --
# generate modified Kneser-Ney discounting parameters from a
# count-of-count file
#
# The purpose of this script is to do the KN computation off-line,
# without ngram-count having to read all counts into memory.
# The output is compatible with the ngram-count -kn<n> options.
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-kn-discounts.gawk,v 1.7 2015-05-27 08:10:52 stolcke Exp $
#
# usage: make-kn-discounts modified=<0|1> min=<mincount> countfile
#
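# The discounts written out follow the (modified) Kneser-Ney estimates
# computed below from the counts-of-counts n1..n4:
#	Y   = n1 / (n1 + 2*n2)
#	D1  = 1 - 2*Y*n2/n1
#	D2  = 2 - 3*Y*n3/n2
#	D3+ = 3 - 4*Y*n4/n3
# With modified=0 all three discounts are simply Y (original KN).
#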
BEGIN {
min = 1;
modified = 1;
}
/^#/ {
# skip comments
next;
}
{
countOfCounts[$1] = $2;
if ($1 != "total" && $1 > maxCount && $2 > 0) {
maxCount = $1;
}
}
#
# Estimate missing counts-of-counts f(k) based on the empirical law
#
# log f(k) - log f(k+1) = a / k
#
# for some constant a dependent on the distribution.
#
function handle_missing_counts() {
#
# compute average a value based on well-defined counts-of-counts
#
a_sum = 0;
for (k = maxCount - 1; k > 0; k --) {
if (countOfCounts[k] == 0) break;
a = k * (log(countOfCounts[k]) - log(countOfCounts[k + 1]));
if (debug) {
print "k = " k ", a = " a > "/dev/stderr";
}
a_sum += a;
}
if (maxCount - 1 == k) {
# no data to estimate a, give up
return;
}
avg_a = a_sum / (maxCount - k - 1);
if (debug) {
print "average a = " avg_a > "/dev/stderr";
}
## print "avg_a", avg_a > "/dev/stderr";
for ( ; k > 0; k --) {
if (countOfCounts[k] == 0) {
countOfCounts[k] = exp(log(countOfCounts[k + 1]) + avg_a / k);
print "estimating missing count-of-count " k \
" = " countOfCounts[k] > "/dev/stderr";
}
}
}
END {
# Code below is essentially identical to ModKneserNey::estimate()
# (Discount.cc).
handle_missing_counts();
if (countOfCounts[1] == 0 || \
countOfCounts[2] == 0 || \
modified && countOfCounts[3] == 0 || \
modified && countOfCounts[4] == 0) \
{
printf "error: one of required counts of counts is zero\n" \
>> "/dev/stderr";
exit(2);
}
Y = countOfCounts[1]/(countOfCounts[1] + 2 * countOfCounts[2]);
if (modified) {
discount1 = 1 - 2 * Y * countOfCounts[2] / countOfCounts[1];
discount2 = 2 - 3 * Y * countOfCounts[3] / countOfCounts[2];
discount3plus = 3 - 4 * Y * countOfCounts[4] / countOfCounts[3];
} else {
# original KN discounting
discount1 = discount2 = discount3plus = Y;
}
print "mincount", min;
print "discount1", discount1;
print "discount2", discount2;
print "discount3+", discount3plus;
# check for invalid values after output, so we see where the problem is
if (discount1 < 0 || discount2 < 0 || discount3plus < 0) {
printf "error: one of modified KneserNey discounts is negative\n" \
>> "/dev/stderr";
exit(2);
}
}

View File

@@ -0,0 +1,32 @@
#!/usr/local/bin/gawk -f
#
# filter a backoff model with a count file, so that only ngrams
# in the countfile are represented in the output
#
# usage: make-lm-subset count-file bo-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-lm-subset.gawk,v 1.3 1999/10/17 06:10:10 stolcke Exp $
#
ARGIND==1 {
ngram = $0;
sub("[ ]*[0-9]*$", "", ngram);
count[ngram] = 1;
next;
}
ARGIND==2 && /^$/ {
print; next;
}
ARGIND==2 && /^\\/ {
print; next;
}
ARGIND==2 && /^ngram / {
print; next;
}
ARGIND==2 {
ngram = $0;
# strip numeric stuff
sub("^[-.e0-9]*[ ]*", "", ngram);
sub("[ ]*[-.e0-9]*$", "", ngram);
if (count[ngram]) print;
next;
}

View File

@@ -0,0 +1,73 @@
#!/usr/local/bin/gawk -f
#
# make-meta-counts --
# Apply N-gram count cut-offs and insert meta-counts (counts-of-counts)
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-meta-counts.gawk,v 1.2 2002/07/22 21:24:45 stolcke Exp $
#
BEGIN {
order = 3;
# trust_total=1 means we don't have to generate meta-counts, just
# apply the cut-offs (in combination with ngram-count -trust-totals)
trust_totals = 0;
metatag = "__META__";
}
NR == 1 {
mincount[1] = mincount1 + 0;
mincount[2] = mincount2 + 0;
mincount[3] = mincount3 + 0;
mincount[4] = mincount4 + 0;
mincount[5] = mincount5 + 0;
mincount[6] = mincount6 + 0;
mincount[7] = mincount7 + 0;
mincount[8] = mincount8 + 0;
mincount[9] = mincount9 + 0;
}
NF > order + 1 {
next;
}
NF > 1 {
this_order = NF - 1;
if (!trust_totals) {
# output buffered ngrams of higher order IF there was at least
# one non-meta count of the respective order
for (i = order; i > this_order; i --) {
if (have_counts[i]) {
printf "%s", buffer[i];
have_counts[i] = 0;
}
delete buffer[i];
}
}
if ($NF < mincount[this_order]) {
if (trust_totals) {
next;
} else {
# convert below-cutoff ngram to meta-ngram
$this_order = metatag int($NF);
$NF = 1;
# add it to buffer
buffer[this_order] = buffer[this_order] $0 "\n";
}
} else {
have_counts[this_order] = 1;
print;
}
}
END {
# output any remaining buffered ngrams
for (i = order; i >= 1; i --) {
if (have_counts[i]) {
printf "%s", buffer[i];
}
}
}

View File

@@ -0,0 +1,70 @@
#!/bin/sh
#
# make-multiword-pfsg --
# rewrite a PFSG in terms of multiwords
#
# usage: make-multiword-pfsg multiword-defs [pfsg] > new-pfsg
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-multiword-pfsg,v 1.5 2015-07-03 03:45:39 stolcke Exp $
#
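# multiword-defs is expected in class-expansion format, one multiword per
# line followed by its component words, e.g. (hypothetical entries):
#	going_to	going to
#	i_mean		i mean
#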
multiword_defs=${1}
shift
tmpdir=${TMPDIR-/tmp}
name="$tmpdir/name.$$"
vocab="$tmpdir/vocab.$$"
old_fsm="$tmpdir/infsm.$$.gz"
class_fsm="$tmpdir/classfsm.$$"
class_fsmc="$tmpdir/classfsmc.$$"
mw_symbols="$tmpdir/mw_symbols.$$"
word_symbols="$tmpdir/word_symbols.$$"
trap "rm -f $name $vocab $old_fsm $class_fsm $class_fsmc $mw_symbols $word_symbols; exit" 0 1 2 15
#
# extract vocab and convert PFSG to FSM
#
${GAWK-gawk} -v name=$name -v vocab=$vocab '$1 == "name" && !have_name {
have_name = 1;
print $2 > name;
}
$1 == "nodes" {
# collect vocabulary
for (i = 3; i <= NF; i ++) {
if ($i != "NULL") is_word[$i] = 1;
}
}
{ print;
}
END {
for (word in is_word) {
print word > vocab
}
}' "$@" | \
pfsg-to-fsm symbolic=1 | \
gzip > $old_fsm
new_name=`cat $name`_multiwords
#
# create multiword transducer
# Note: this is the same as reversed class-transducer
#
classes-to-fsm vocab=$vocab symbolic=1 \
isymbolfile=$mw_symbols \
osymbolfile=$word_symbols \
$multiword_defs > $class_fsm
fsmcompile -t -i $mw_symbols -o $word_symbols $class_fsm | \
fsminvert > $class_fsmc
#
# compose original FSM with multiword transducer;
# then convert back to PFSG
#
{ gzip -dcf $old_fsm; rm -f $old_fsm; } | fsmcompile -i $word_symbols | \
fsmcompose - $class_fsmc | fsmproject -o | \
fsmprint -i $mw_symbols | fsm-to-pfsg pfsg_name=$new_name

View File

@@ -0,0 +1,135 @@
#!/usr/local/bin/gawk -f
#
# nbest2pfsg --
# convert Decipher N-best list to PFSG lattice
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-nbest-pfsg.gawk,v 1.5 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
initial = 0;
final = 1;
nodecount = 2;
transcount = 0;
null = "NULL";
outputs[initial] = outputs[final] = null;
format = 0;
name = "";
	notree = 0;	# 1 = don't merge hyps into a prefix tree
scale = 0; # scaling factor for log posteriors
amw = 1; # acoustic model weight
lmw = 8; # language model weight
wtw = 0; # word transition weight
}
function start_hyp() {
lastnode = initial;
}
function add_word(word, weight) {
nextnode = tree[lastnode " " word];
if (nextnode && !notree) {
if (weights[lastnode " " nextnode] != weight) {
printf "inconsistent weight for transition %s -> %s\n",\
lastnode, nextnode >> "/dev/stderr";
exit 1;
}
lastnode = nextnode;
} else {
newnode = nodecount ++;
outputs[newnode] = word;
tree[lastnode " " word] = newnode;
weights[lastnode " " newnode] = weight;
transcount ++;
lastnode = newnode;
}
}
function end_hyp(weight) {
nextnode = tree[lastnode " " null];
if (nextnode && !notree) {
if (weights[lastnode " " nextnode] != weight) {
printf "inconsistent final weight for %s\n",\
lastnode >> "/dev/stderr";
exit 1;
}
} else {
tree[lastnode " " null] = final;
weights[lastnode " " final] = weight;
transcount ++;
}
}
function print_pfsg(name) {
printf "name %s\n", name;
printf "nodes %d", nodecount;
for (node = 0; node < nodecount; node ++) {
printf " %s", outputs[node];
}
printf "\n";
printf "initial %d\n", initial;
printf "final %d\n", final;
printf "transitions %d\n", transcount;
for (trans in weights) {
split(trans, a);
fromnode = a[1];
tonode = a[2];
printf "%d %d %g\n", fromnode, tonode, \
weights[fromnode " " tonode];
}
printf "\n";
}
/^NBestList1\.0/ {
format = 1;
next;
}
/^NBestList2\.0/ {
format = 2;
next;
}
format == 0 {
totalscore = scale * (amw * $1 + lmw * $2 + wtw * $3);
start_hyp();
for (i = 4; i <= NF; i ++) {
add_word($i, 0);
}
end_hyp(totalscore);
next;
}
format == 1 {
totalscore = scale * substr($1, 2, length($1)-2);
start_hyp();
for (i = 2; i <= NF; i ++) {
add_word($i, 0);
}
end_hyp(totalscore);
next;
}
format == 2 {
start_hyp();
for (i = 2; i <= NF; i += 11) {
add_word($i, scale * ($(i + 7) + $(i + 9)));
}
end_hyp(0);
next;
}
END {
if (!name) {
name = FILENAME;
}
print_pfsg(name);
}

View File

@@ -0,0 +1,351 @@
#!/usr/local/bin/gawk -f
#
# make-ngram-pfsg --
# Create a Decipher PFSG from an N-gram language model
#
# usage: make-ngram-pfsg [debug=1] [check_bows=1] [maxorder=N] [no_empty_bo=1] backoff-lm > pfsg
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-ngram-pfsg.gawk,v 1.32 2015-07-03 03:45:38 stolcke Exp $
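# Sketch of the construction (as implemented below): every N-gram context
# listed in the LM becomes a PFSG node emitting its last word; explicit
# N-gram probabilities become transitions between contexts, and backoff
# weights become transitions into NULL-output "__BACKOFF__ <context>" nodes
# from which the lower-order estimates can be reached.
#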
#
#########################################
#
# Output format specific code
#
BEGIN {
logscale = 2.30258509299404568402 * 10000.5;
round = 0.5;
start_tag = "<s>";
end_tag = "</s>";
null = "NULL";
version = 0;
top_level_name = "";
no_empty_bo = 0;
if ("TMPDIR" in ENVIRON) {
tmpdir = ENVIRON["TMPDIR"];
} else {
tmpdir = "/tmp"
}
if ("pid" in PROCINFO) {
pid = PROCINFO["pid"];
} else {
getline pid < "/dev/pid";
}
tmpfile = tmpdir "/pfsg." pid;
# hack to remove tmpfile when killed
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
print "" | trap_cmd;
debug = 0;
write_contexts = "";
read_contexts = "";
}
function rint(x) {
if (x < 0) {
return int(x - round);
} else {
return int(x + round);
}
}
function scale_log(x) {
return rint(x * logscale);
}
function output_for_node(name) {
num_words = split(name, words);
if (num_words == 0) {
print "output_for_node: got empty name" >> "/dev/stderr";
exit(1);
} else if (words[1] == bo_name) {
return null;
} else if (words[num_words] == end_tag || \
words[num_words] == start_tag)
{
return null;
} else {
return words[num_words];
}
}
function node_exists(name) {
return (name in node_num);
}
function node_index(name) {
i = node_num[name];
if (i == "") {
i = num_nodes ++;
node_num[name] = i;
node_string[i] = output_for_node(name);
if (debug) {
print "node " i " = " name ", output = " node_string[i] \
>> "/dev/stderr";
}
}
return i;
}
function start_grammar(name) {
num_trans = 0;
num_nodes = 0;
return;
}
function end_grammar(name) {
if (!node_exists(start_tag)) {
print start_tag " tag undefined in LM" >> "/dev/stderr";
exit(1);
} else if (!node_exists(end_tag)) {
print end_tag " tag undefined in LM" >> "/dev/stderr";
exit(1);
}
printf "%d pfsg nodes\n", num_nodes >> "/dev/stderr";
printf "%d pfsg transitions\n", num_trans >> "/dev/stderr";
# output version id if supplied
if (version) {
print "version " version "\n";
}
# use optional top-level grammar name if given
print "name " (top_level_name ? top_level_name : name);
printf "nodes %s", num_nodes;
for (i = 0; i < num_nodes; i ++) {
printf " %s", node_string[i];
}
printf "\n";
print "initial " node_index(start_tag);
print "final " node_index(end_tag);
print "transitions " num_trans;
fflush();
if (close(tmpfile) < 0) {
print "error closing tmp file" >> "/dev/stderr";
exit(1);
}
system("/bin/cat " tmpfile);
}
function add_trans(from, to, prob) {
if (debug) {
print "add_trans " from " -> " to " " prob >> "/dev/stderr";
}
num_trans ++;
print node_index(from), node_index(to), scale_log(prob) > tmpfile;
}
#########################################
#
# Generic code for parsing backoff file
#
BEGIN {
maxorder = 0;
grammar_name = "PFSG";
bo_name = "__BACKOFF__";
start_bo_name = bo_name " __FROM_START__";
check_bows = 0;
epsilon = 1e-5; # tolerance for lowprob detection
}
NR == 1 {
start_grammar(grammar_name);
if (read_contexts) {
while ((getline context < read_contexts) > 0) {
is_context[context] = 1;
}
close(read_contexts);
}
}
NF == 0 {
next;
}
/^ngram *[0-9][0-9]*=/ {
num_grams = substr($2,index($2,"=")+1);
if (num_grams > 0) {
order = substr($2,1,index($2,"=")-1);
# limit maximal N-gram order if desired
if (maxorder > 0 && order > maxorder) {
order = maxorder;
}
if (order == 1) {
grammar_name = "UNIGRAM_PFSG";
} else if (order == 2) {
grammar_name = "BIGRAM_PFSG";
} else if (order == 3) {
grammar_name = "TRIGRAM_PFSG";
} else {
grammar_name = "NGRAM_PFSG";
}
}
next;
}
/^\\[0-9]-grams:/ {
currorder = substr($0,2,1);
next;
}
/^\\/ {
next;
}
#
# unigram parsing
#
currorder == 1 {
first_word = last_word = ngram = $2;
ngram_prefix = ngram_suffix = "";
# we need all unigram backoffs (except for </s>),
# so fill in missing bow where needed
if (NF == 2 && last_word != end_tag) {
$3 = 0;
}
}
#
# bigram parsing
#
currorder == 2 {
ngram_prefix = first_word = $2;
ngram_suffix = last_word = $3;
ngram = $2 " " $3;
}
#
# trigram parsing
#
currorder == 3 {
first_word = $2;
last_word = $4;
ngram_prefix = $2 " " $3;
ngram_suffix = $3 " " $4;
ngram = ngram_prefix " " last_word;
}
#
# higher-order N-gram parsing
#
currorder >= 4 && currorder <= order {
first_word = $2;
last_word = $(currorder + 1);
ngram_infix = $3;
for (i = 4; i <= currorder; i ++ ) {
ngram_infix = ngram_infix " " $i;
}
ngram_prefix = first_word " " ngram_infix;
ngram_suffix = ngram_infix " " last_word;
ngram = ngram_prefix " " last_word;
}
#
# shared code for N-grams of all orders
#
currorder <= order {
prob = $1;
bow = $(currorder + 2);
# skip backoffs that exceed maximal order,
# but always include unigram backoffs
if (bow != "" && (currorder == 1 || currorder < order)) {
# remember all LM contexts for creation of N-gram transitions
bows[ngram] = bow;
# To avoid empty paths through backoff, we reroute transitions
# out of the start node to a special backoff node that does not
# connect directly to the end node.
if (no_empty_bo && ngram == start_tag) {
this_bo_name = start_bo_name;
} else {
this_bo_name = bo_name;
}
# insert backoff transitions
if (read_contexts ? (ngram in is_context) : \
(currorder < order - 1)) \
{
add_trans(this_bo_name " " ngram, this_bo_name " " ngram_suffix, bow);
add_trans(ngram, this_bo_name " " ngram, 0);
} else {
add_trans(ngram, this_bo_name " " ngram_suffix, bow);
}
if (write_contexts) {
print ngram_suffix > write_contexts;
}
}
if (last_word == start_tag) {
if (currorder > 1) {
printf "warning: ignoring ngram into start tag %s -> %s\n", \
ngram_prefix, last_word >> "/dev/stderr";
}
} else {
# insert N-gram transition to maximal suffix of target context
if (last_word == end_tag) {
target = end_tag;
} else if (ngram in bows || currorder == 1) {
# the minimal context is unigram
target = ngram;
} else if (ngram_suffix in bows) {
target = ngram_suffix;
} else {
target = ngram_suffix;
for (i = 3; i <= currorder; i ++) {
target = substr(target, length($i) + 2);
if (target in bows) break;
}
}
if (currorder == 1 || \
(read_contexts ? (ngram_prefix in is_context) : \
(currorder < order))) \
{
add_trans(bo_name " " ngram_prefix, target, prob);
# Duplicate transitions out of unigram backoff for the
# start-backoff-node
if (no_empty_bo && \
node_exists(start_bo_name " " ngram_prefix) && \
target != end_tag)
{
add_trans(start_bo_name " " ngram_prefix, target, prob);
}
} else {
add_trans(ngram_prefix, target, prob);
}
if (check_bows) {
if (currorder < order) {
probs[ngram] = prob;
}
if (ngram_suffix in probs && \
probs[ngram_suffix] + bows[ngram_prefix] - prob > epsilon)
{
printf "warning: ngram loses to backoff %s -> %s\n", \
ngram_prefix, last_word >> "/dev/stderr";
}
}
}
}
END {
end_grammar(grammar_name);
}

View File

@@ -0,0 +1,49 @@
#!/usr/local/bin/gawk -f
#
# make-sub-lm --
# extract a lower-order backoff LM from a higher order one.
#
# usage: make-sub-lm maxorder=<n> lm-file > sub-lm-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/make-sub-lm.gawk,v 1.2 1998/11/09 05:54:12 stolcke Exp $
#
BEGIN {
maxorder=2;
}
NF==0 {
print; next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
if (order <= maxorder) print;
next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
if (currorder <= maxorder) {
print;
} else {
print "\n\\end\\";
exit;
}
next;
}
/^\\/ {
print; next;
}
currorder {
if (currorder < maxorder) {
print;
} else if (currorder == maxorder) {
#
# delete backoff weight for maximal ngram
#
if (NF == currorder + 2) {
$NF = "";
}
print;
}
next;
}
{ print }

View File

@@ -0,0 +1,133 @@
#!/bin/sh
#
# merge-batch-counts --
# combine batch count files into a single count file
#
# $Header: /home/srilm/CVS/srilm/utils/src/merge-batch-counts,v 1.9 2013/03/19 18:37:51 stolcke Exp $
#
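# Example: merge all count files left by make-batch-counts under ./counts,
# pairwise (the default), into a single final count file:
#	merge-batch-counts counts
# Use -l N to merge N files per ngram-merge call; a file list or a start
# iteration number may be given as a second argument to resume a run.
#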
merge_options=
merge_size=2
while [ $# -gt 0 ]; do
case "$1" in
-float-counts)
merge_options=-float-counts
shift
;;
-l) merge_size=$2
shift; shift
;;
*) break
;;
esac
done
if [ $# -lt 1 ]; then
echo "usage: $0 [-float-counts] [-l N] countdir [file-list | start-iter]" >&2
exit 2
fi
countdir=${1-./counts}
filelist=$2
iter=0
mergedir=$countdir
merger=ngram-merge
newfilefile=$mergedir/newfiles$$
set -e
# find right xarg option
if xargs -L1 </dev/null >/dev/null 2>&1; then
xargs_l=L
else
xargs_l=l
fi
# make sure partially generated files are deleted
trap 'rm -f $newfile $newfilefile $test_in $test_out; exit 1' 1 2 15
# determine if ngram-merge can generate compressed files
test_in=$mergedir/testin
test_out=$mergedir/testout.gz
echo "x 1" > $test_in
$merger -write $test_out $test_in $test_in
if gzip -l $test_out >/dev/null 2>&1; then
gz=.gz
else
gz=
fi
rm $test_in $test_out
case X$filelist in
X[0-9]*)
# restart a previous run
what=merge
iter=`expr $filelist + 1`
infiles=$mergedir/$what-iter$iter.files
find $countdir/. \( \
-name $what-iter$filelist-\*.ngrams.gz -o \
-name $what-iter$filelist-\*.ngrams \) -print | \
sort | xargs -${xargs_l}2 /bin/echo > $infiles
;;
X)
what=merge
infiles=$mergedir/$what-iter$iter.files
find $countdir/. \( \
-name \*.ngrams.gz -o \
-name \*.ngrams \) -print | sort | \
xargs -${xargs_l}2 /bin/echo > $infiles
;;
X*)
what=`basename $filelist .files`
infiles=$mergedir/$what-iter$iter.files
cat $filelist > $infiles
;;
esac
numfiles=`wc -w < $infiles`
while [ $numfiles -gt 1 ]; do
echo "ITERATION $iter, $numfiles files" >&2
fileno=1
> $newfilefile
while read file1 morefiles; do
newfile=$mergedir/$what-iter$iter-$fileno.ngrams$gz
if [ -f $newfile ]; then
echo "retaining old $newfile" >&2
echo $newfile >>$newfilefile
elif [ -z "$morefiles" ]; then
echo "linking $file1 to $newfile" >&2
rm -f $newfile
ln $file1 $newfile
# put the linked file at the top of the file list
# for the next iteration, to keep file sizes balanced
mv $newfilefile $newfilefile.old
echo $newfile >$newfilefile
cat $newfilefile.old >> $newfilefile
rm $newfilefile.old
else
echo "merging $file1 $morefiles into $newfile" >&2
$merger $merge_options -write $newfile $file1 $morefiles
echo $newfile >>$newfilefile
fi
fileno=`expr $fileno + 1`
done < $infiles
xargs rm -f < $infiles
iter=`expr $iter + 1`
infiles=$mergedir/$what-iter$iter.files
cat $newfilefile | xargs -${xargs_l}$merge_size /bin/echo > $infiles
numfiles=`wc -w < $infiles`
done
rm -f $newfilefile
echo "final counts in `cat $infiles`" >&2

View File

@@ -0,0 +1,180 @@
#!/usr/local/bin/gawk -f
#
# merge-nbest --
# merge hyps from multiple N-best lists into a single list
#
# $Header: /home/srilm/CVS/srilm/utils/src/merge-nbest.gawk,v 1.8 2010/08/20 00:17:18 stolcke Exp $
#
BEGIN {
M_LN10 = 2.30258509299404568402; # from <math.h>
logINF = -320;
bytelogscale = M_LN10 * 10000.5 / 1024.0;
use_orig_hyps = 1;
add_scores = 0;
last_nbestformat = -1;
nbestmagic1 = "NBestList1.0";
nbestmagic2 = "NBestList2.0";
pause = "-pau-";
max_nbest = 0;
multiwords = 0;
multichar = "_";
nopauses = 0;
}
function log10(x) {
return log(x) / M_LN10;
}
function exp10(x) {
if (x < logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
function process_nbest(file) {
input = "exec gzip -dcf " file;
nbestformat = 0;
num_hyps = 0;
while ((status = (input | getline)) > 0) {
if ($1 == nbestmagic1) {
nbestformat = 1;
} else if ($1 == nbestmagic2) {
nbestformat = 2;
} else {
words = "";
num_words = 0;
num_hyps ++;
if (max_nbest > 0 && num_hyps > max_nbest) {
break;
}
if (nbestformat == 1) {
for (i = 2; i <= NF; i++) {
words = words " " $i;
if ($i != pause) num_words ++;
}
score = substr($1, 2, length($1)-2)/bytelogscale;
num_words = 1;
} else if (nbestformat == 2) {
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have
# incorrect timemarks. We filter them based on their
# token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
words = words " " $i;
if ($i != pause) num_words ++;
prev_end_time = end_time;
}
}
score = substr($1, 2, length($1)-2)/bytelogscale;
} else {
for (i = 4; i <= NF; i++) {
words = words " " $i;
}
score = $1 + 8 * $2;
num_words = $3;
}
# resolve multiwords and eliminate pauses if so desired
if (multiwords) {
gsub(multichar, " ", words);
}
if (nopauses) {
gsub(" " pause, " ", words);
}
# if word sequence is new, record it
if (!(words in scores)) {
scores[words] = score;
hyps[words] = $0;
nwords[words] = num_words;
} else if (add_scores) {
scores[words] = addlogs(scores[words], score);
}
if (last_nbestformat < 0) {
last_nbestformat = nbestformat;
} else if (nbestformat != last_nbestformat) {
use_orig_hyps = 0;
last_nbestformat = nbestformat;
}
}
}
if (status < 0) {
print "error opening " file >> "/dev/stderr";
}
close(input);
}
function output_nbest() {
if (!use_orig_hyps || use_orig_hyps && last_nbestformat == 1) {
print nbestmagic1;
} else if (use_orig_hyps && last_nbestformat == 2) {
print nbestmagic2;
}
for (words in scores) {
if (add_scores) {
print scores[words], 0, nwords[words], words;
} else if (use_orig_hyps) {
print hyps[words];
} else {
print "(" (scores[words] * bytelogscale) ")" words;
}
}
}
BEGIN {
if (ARGC < 2) {
print "usage: " ARGV[0] " N-BEST1 N-BEST2 ..." \
>> "/dev/stderr";
exit(2);
}
for (arg = 1; arg < ARGC; arg ++) {
if (equals = index(ARGV[arg], "=")) {
var = substr(ARGV[arg], 1, equals - 1);
val = substr(ARGV[arg], equals + 1);
if (var == "multiwords") {
multiwords = val + 0;
} else if (var == "multichar") {
multichar = val;
} else if (var == "max_nbest") {
max_nbest = val + 0;
} else if (var == "nopauses") {
nopauses = val + 0;
} else if (var == "use_orig_hyps") {
use_orig_hyps = val + 0;
} else if (var == "add_scores") {
add_scores = val + 0;
}
} else {
process_nbest(ARGV[arg]);
}
}
output_nbest();
}
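
Example usage (a sketch; the nbest file names are placeholders, and the variable assignments are the ones defined in the BEGIN block above):

    # merge two systems' nbest lists, keeping at most 100 hyps per list
    # and stripping pause tokens
    merge-nbest max_nbest=100 nopauses=1 \
        sys1/utt0001.score.gz sys2/utt0001.score.gz > merged/utt0001.score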

View File

@@ -0,0 +1,337 @@
#!/usr/local/bin/gawk -f
#
# metadb --
# access the META-DB
#
# These files are subject to the SRILM Community Research License Version
# 1.0 (the "License"); you may not use these files except in compliance
# with the License. A copy of the License is included in the SRILM root
# directory. Software distributed under the License is distributed on an
# "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
# See the License for the specific language governing rights and
# limitations under the License. This software is Copyright (c) SRI
# International, 1995-2011. All rights reserved.
#
# $Header: /home/srilm/CVS/srilm/utils/src/metadb.gawk,v 1.26 2011/11/26 06:22:34 stolcke Exp $
#
function do_defines() {
# process all defines
for (d in defines) {
gsub(d, defines[d]);
}
# remove leading and trailing whitespace from value
sub("^[ ]*", "");
sub("[ ]*$", "");
}
function print_error(msg) {
print filename ", line " lineno ": " msg >> "/dev/stderr";
}
# process an included file
# return 1 if caller should quit reading, 0 if not
function process_config_file(file) {
if (file in including) {
print "metadb INCLUDE looping through " file >> "/dev/stderr";
exit 2
}
including[file] = 1;
if (trace_includes) {
print "READING " file >> "/dev/stderr";
}
filename = file;
lineno = 0;
while ((status = (getline < file)) > 0) {
lineno ++;
# skip comments and empty lines
if (NF == 0 || $1 ~ /^#/) {
continue;
}
if ($1 == "DEFINE") {
if (NF < 2) {
print_error("incomplete DEFINE");
exit 2;
} else {
symbol = $2;
$1 = $2 = "";
do_defines();
defines[symbol] = $0;
}
} else if ($1 == "SDEFINE") {
if (NF < 2) {
print_error("incomplete SDEFINE");
exit 2;
} else {
symbol = $2;
$1 = $2 = "";
do_defines();
# run right-hand-side as command and use output as value
$0 | getline defines[symbol];
close($0);
}
} else if ($1 == "MDEFINE") {
if (NF < 2) {
print_error("incomplete MDEFINE");
exit 2;
} else if (!recursive) {
symbol = $2;
$1 = $2 = "";
# look up the right-hand-side as metadb key,
# avoiding recursive invocations
db_command = "metadb -recursive -config " config_file " " $0;
if (debug) {
print "metadb: " symbol " mdefined by: " db_command >> "/dev/stderr";
}
db_command | getline defines[symbol];
close(db_command);
}
} else if ($1 == "UNDEF") {
if (NF < 2) {
print_error("incomplete UNDEF");
exit 2;
} else {
delete defines[$2];
}
} else if ($1 == "INCLUDE") {
if (NF < 2) {
print_error("missing INCLUDE filename");
exit 1
} else {
$1 = "";
do_defines();
if (! ($0 ~ /^\//)) {
includefile = file;
sub("[^/]*$", "", includefile);
if (includefile) {
includefile = includefile $0;
} else {
includefile = $0;
}
} else {
includefile = $0;
}
if (process_config_file(includefile)) {
close(file);
delete including[file];
return 1;
}
filename = file;
if (trace_includes) {
print "READING " file >> "/dev/stderr";
}
}
} else if ($1 == "ALIAS") {
if (NF != 3 || $2 == $3) {
print_error("illegal ALIAS");
exit 2
}
if (dump_values) print $0;
if ($2 == key) {
if (debug) {
print "metadb: " key " redirected to " $3 >> "/dev/stderr";
}
# close all currently read files so they can be read again
# from the top
for (f in including) {
close(f)
}
# forget all current file inclusions
delete including;
key = $3;
return process_config_file(config_file);
}
} else if ($1 == "ALIAS_SUFFIX") {
if (NF != 3 || $2 == $3) {
print_error("illegal ALIAS_SUFFIX");
exit 2
}
if (dump_values) print $0;
suffix_len = length($2);
key_len = length(key);
key_prefix = substr(key, 1, key_len-suffix_len);
if ($2 == substr(key, key_len-suffix_len+1) && !index(key_prefix, "_")) {
# close all currently read files so they can be read again
# from the top
for (f in including) {
close(f)
}
# forget all current file inclusions
delete including;
old_key = key;
key = key_prefix $3;
if (debug) {
print "metadb: " old_key " redirected to " key >> "/dev/stderr";
}
return process_config_file(config_file);
}
} else if ($1 == key || dump_values) {
this_key = $1;
$1 = "";
do_defines();
if ($0 == "__END__") {
if (dump_values) {
have_keys[this_key] = 1;
continue;
} else {
close(file);
delete including[file];
return 1;
}
}
if (query_mode) {
exit 0;
} else if (dump_values) {
# when dumping all keys, output the first key value found
if (!(this_key in have_keys)) {
print this_key, $0;
if (!all_values) {
have_keys[this_key] = 1;
}
}
} else {
if (debug) {
print "metadb: " key "=" $0 >> "/dev/stderr";
}
if (!error_mode || $0 != "") {
key_found = 1;
print;
}
}
if (!all_values && !dump_values) {
close(file);
delete including[file];
return 1;
}
}
}
if (status < 0) {
print "error reading " file >> "/dev/stderr";
exit 2;
}
close(file);
delete including[file];
return 0;
}
function print_usage() {
print "usage: metadb [-options ...] key1 [key2 ...]";
print "-q query mode -- check if key is defined";
print "-e exit with error message if key is undefined";
print "-all return multiple key values";
print "-dump dump all key and values";
print "-includes list included files";
print "-config FILE set config file (default $" db_config ")";
}
BEGIN {
key = "";
all_values = 0;
dump_values = 0;
trace_includes = 0;
recursive = 0;
db_config = "METADB_CONFIG";
config_file = "";
query_mode = 0;
error_mode = 0;
debug = ENVIRON["METADB_DEBUG"];
for (i = 1; i < ARGC ; i ++) {
if (ARGV[i] == "-q") {
query_mode = 1;
} else if (ARGV[i] == "-e") {
error_mode = 1;
} else if (ARGV[i] == "-all") {
all_values = 1;
} else if (ARGV[i] == "-dump") {
dump_values = 1;
} else if (ARGV[i] == "-includes") {
trace_includes = 1;
} else if (ARGV[i] == "-recursive") {
recursive = 1;
} else if (ARGV[i] == "-config") {
config_file = ARGV[i + 1];
i ++;
} else if (ARGV[i] == "-help") {
print_usage();
exit 0;
} else if (ARGV[i] ~ /^-/) {
print "unknown option: " ARGV[i] >> "/dev/stderr";
exit 2;
} else {
break;
}
}
if (!config_file) {
if (db_config in ENVIRON) {
config_file = ENVIRON[db_config];
} else {
print db_config " not defined" >> "/dev/stderr";
exit 1;
}
}
if (config_file == "") {
print "empty config file name" >> "/dev/stderr";
exit 1;
}
if (dump_values) {
key = "";
process_config_file(config_file);
}
for ( ; i < ARGC ; i ++) {
key = ARGV[i];
key_found = 0;
process_config_file(config_file);
if (error_mode && !key_found) {
print "key \"" key "\" empty or not defined in " config_file \
>> "/dev/stderr";
exit 1;
}
}
if (query_mode) {
# we only get here if nothing was found, so return with error
exit 1;
}
}
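
Example usage (a sketch; expt.config and the key names are placeholders):

    METADB_CONFIG=expt.config; export METADB_CONFIG

    metadb TRAIN_TEXT                   # print the value bound to key TRAIN_TEXT
    metadb -q DEV_NBEST_DIR             # exit status 0 iff the key is defined
    metadb -dump -config expt.config    # dump all keys and values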

View File

@@ -0,0 +1,56 @@
#!/bin/sh
#
# nbest-error --
# compute minimum error of nbest lists
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-error,v 1.6 2013/03/09 07:13:02 stolcke Exp $
#
if [ $# -lt 2 ]; then
echo "usage: $0 score-dir refs [nbest-lattice-option ...]" >&2
echo " or $0 file-list refs [nbest-lattice-option ...]" >&2
exit 2
fi
scoredir="$1"
refs="$2"
shift; shift
option=-nbest-error
case "$*" in
*-lattice-error*) option= ;;
esac
if [ ! -r $scoredir ]; then
echo "$0: cannot access $scoredir" >&2
exit 1
fi
if [ ! -r $refs ]; then
echo "$0: cannot access $refs" >&2
exit 1
fi
if [ -d $scoredir ]; then
find $scoredir -follow \
-type f \( -name \*.score -o \
-name \*.Z -o \
-name \*.gz \) \
-print | sort
else
cat $scoredir
fi | \
nbest-lattice -nbest-files - -refs $refs $option "$@" | \
${GAWK-gawk} '
$2 ~ /^[0-9]*$/ && $10 ~ /^[0-9]*$/ && $9 == "words" {
nsents ++;
nwords += $10;
nerrors += $2;
print;
}
END {
printf "%d sentences, %d words, %d errors (%.2f%%)\n", \
nsents, nwords, nerrors, 100*nerrors/nwords;
}'
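
Example usage (a sketch; the score directory and reference file are placeholders, and any trailing options are passed through to nbest-lattice):

    # minimum achievable word error of the nbest lists under nbest/dev,
    # scored against the references in dev.refs
    nbest-error nbest/dev dev.refs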

View File

@@ -0,0 +1,96 @@
#!/usr/local/bin/gawk -f
#
# nbest-oov-counts --
# generate OOV counts for an nbest list
#
# usage: nbest-oov-counts vocab=VOCAB [vocab_aliases=ALIASES] NBESTLIST > COUNTS
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-oov-counts.gawk,v 1.2 2017/08/15 19:29:34 stolcke Exp $
#
BEGIN {
nbestformat = 0;
}
$1 ~ /^NBestList1\.0/ {
nbestformat = 1;
next;
}
$1 ~ /^NBestList2\.0/ {
nbestformat = 2;
next;
}
NR == 1 {
nwords = 0;
while ((getline line < vocab) > 0) {
if (split(line, a) > 0) {
in_vocab[a[1]] = 1;
nwords ++;
}
}
print "read " nwords " vocab words" > "/dev/stderr";
naliases = 0;
if (vocab_aliases) {
while ((getline line < vocab_aliases) > 0) {
if (split(line, a) >= 2) {
vocab_mapping[a[1]] = a[2];
naliases ++;
}
}
print "read " naliases " vocab aliases" > "/dev/stderr";
}
# add default vocabulary
in_vocab["<s>"] = 1;
in_vocab["</s>"] = 1;
in_vocab["-pau-"] = 1;
}
function process_word(w) {
if (w in vocab_mapping) {
word = vocab_mapping[w];
} else {
word = w;
}
if (!(word in in_vocab)) {
oov_count ++;
}
}
NF > 1 {
oov_count = 0;
if (nbestformat == 1) {
# for Decipher nbest format 1 we use the aggregate score only
for (i = 2; i <= NF; i ++) {
process_word($i);
}
} else if (nbestformat == 2) {
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
process_word($i);
prev_end_time = end_time;
}
}
} else {
for (i = 4; i <= NF; i ++) {
process_word($i);
}
}
print oov_count;
}
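
Example usage (a sketch; the vocabulary and nbest file names are placeholders):

    # one OOV count per hyp, in nbest-list order
    nbest-oov-counts vocab=lm.vocab nbest/utt0001.score > oov/utt0001.counts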

View File

@@ -0,0 +1,64 @@
#!/usr/local/bin/gawk -f
#
# nbest-optimize-args-from-rover-control --
# Extract initial score weights and arguments from rover-control file
# for use with nbest-optimize
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-optimize-args-from-rover-control.gawk,v 1.2 2017/08/16 06:34:16 stolcke Exp $
#
BEGIN {
num_extras = 0;
}
# skip comment or empty line
/^##/ || /^[ ]*$/ {
next;
}
# extra score file line
$3 == "+" {
num_extras ++;
extra_dir[num_extras] = $1;
extra_weight[num_extras] = $2;
next;
}
# main system
{
system_dir = $1;
lm_weight = $2;
wt_weight = $3;
max_nbest = $5;
post_scale = $6;
weights = "1 " lm_weight " " wt_weight;
for (i = 1; i <= num_extras; i ++) {
weights = weights " " extra_weight[i];
}
if (print_weights) {
print weights;
} else if (print_dirs) {
for (i = 1; i <= num_extras; i ++) {
print extra_dir[i];
}
} else {
# output all arguments
if (post_scale != "" && post_scale != 0) {
print "-posterior-scale " post_scale;
}
if (max_nbest != "" && max_nbest != 0) {
print "-max-nbest " max_nbest;
}
print "-init-lambdas '" weights "'";
for (i = 1; i <= num_extras; i ++) {
print extra_dir[i];
}
}
num_extras = 0;
}
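
Example usage (a sketch; rover.control is a placeholder for an nbest-rover control file, and the output is intended to be pasted into an nbest-optimize command line, per the header comment):

    # print -posterior-scale/-max-nbest/-init-lambdas arguments and extra score dirs
    nbest-optimize-args-from-rover-control rover.control

    # print only the initial score weights
    nbest-optimize-args-from-rover-control print_weights=1 rover.control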

View File

@@ -0,0 +1,184 @@
#!/usr/local/bin/gawk -f
#
# nbest-posteriors --
# rescale the scores in an nbest list to reflect weighted posterior
# probabilities
#
# usage: nbest-posteriors [ weight=W amw=AMW lmw=LMW wtw=WTW postscale=S max_nbest=M ] NBEST-FILE
#
# The output is the same input NBEST-FILE with acoustic scores set to
# the log10 of the posterior hyp probabilities and LM scores set to zero.
# postscale=S attenuates the posterior distribution by dividing combined log
# scores by S (the default is S=LMW).
#
# If weight=W is specified the posteriors are multiplied by W.
# (This is useful to combine multiple nbest lists in a weighted fashion).
# The input should be in SRILM nbest-format.
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-posteriors.gawk,v 1.14 2019/02/08 14:13:35 stolcke Exp $
#
BEGIN {
M_LN10 = 2.30258509299404568402;
weight = 1.0;
amw = 1.0;
lmw = 8.0;
wtw = 0.0;
postscale = 0;
max_nbest = 0;
logINF = -320; # log10 of smallest representable number
log_total_numerator = logINF;
bytelogscale = 1024.0 / 10000.5 / M_LN10;
nbestformat = 0;
noheader = 0;
# tag to identify nbest list in output_posteriors
nbest_tag = 1;
}
function log10(x) {
return log(x)/M_LN10;
}
function exp10(x) {
if (x <= logINF) {
return 0;
} else {
return exp(x * M_LN10);
}
}
function addlogs(x,y) {
if (x<y) {
temp = x; x = y; y = temp;
}
return x + log10(1 + exp10(y - x));
}
# by default, use posterior scale = lmw
NR == 1 {
if (!postscale) {
if (lmw == 0) {
postscale = 1.0;
} else {
postscale = lmw;
}
}
}
$1 ~ /^NBestList1\.0/ {
nbestformat = 1;
if (!noheader) {
# keep header in output
print;
}
if (lmw != 0 || wtw != 0) {
print "warning: cannot apply LMW or WTW to Decipher N-nbest lists" \
>> "/dev/stderr";
}
next;
}
$1 ~ /^NBestList2\.0/ {
nbestformat = 2;
if (!noheader) {
# keep header in output
print;
}
next;
}
NF > 1 {
if (max_nbest && num_hyps == max_nbest) exit;
num_hyps ++;
if (nbestformat == 1) {
# for Decipher nbest format 1 we use the aggregate score only
total_score = substr($1,2,length($1)-2);
total_score *= bytelogscale * amw/postscale;
} else if (nbestformat == 2) {
total_score = substr($1,2,length($1)-2);
# compute total AC and LM scores
lm_score = 0;
num_tokens = 0;
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
num_tokens ++;
lm_score += $(i + 7);
prev_end_time = end_time;
}
}
# Compute AC score from total and lm scores. This takes into
# account that the recognizer might sum scores of equivalent hyps
# (e.g., those differing only in pauses or pronunciations) and
# reflect the summing in the total score, but not in the word AC
# scores.
ac_score = total_score - lm_score;
# Note we don't eliminate pause tokens from the word count, since
# the recognizer includes them in word count weighting.
# (Only after LM rescoring are pauses ignored.)
total_score = amw * ac_score + lmw * lm_score + wtw * num_tokens;
total_score *= bytelogscale/postscale;
} else {
total_score = (amw * $1 + lmw * $2 + wtw * $3)/postscale;
}
if (num_hyps == 1) {
score_offset = total_score;
}
total_score -= score_offset;
#
# store posteriors and hyp words
#
log_posteriors[num_hyps] = total_score;
log_total_numerator = addlogs(log_total_numerator, total_score);
num_words[num_hyps] = $3;
if (nbestformat > 0) {
$1 = "";
} else {
$1 = $2 = $3 = "";
}
hyps[num_hyps] = $0;
}
END {
for (i = 1; i <= num_hyps; i ++) {
unweighted_logpost = log_posteriors[i] - log_total_numerator;
logpost = log10(weight) + unweighted_logpost;
if (nbestformat > 0) {
printf "(%f) %s\n", logpost / bytelogscale, hyps[i];
} else {
print logpost, 0, num_words[i], hyps[i];
}
if (output_posteriors) {
print nbest_tag, i, unweighted_logpost >> output_posteriors;
}
}
}
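
Example usage (a sketch; the nbest file names are placeholders and the weights are illustrative values only; piping through gzip -dcf mirrors how nbest-rover invokes this script):

    # convert scores to weighted log posteriors using LMW=8, WTW=0,
    # posterior scale 12, keeping at most 200 hyps
    gzip -dcf nbest/utt0001.score.gz | \
        nbest-posteriors lmw=8 wtw=0 postscale=12 max_nbest=200 > post/utt0001.score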

View File

@@ -0,0 +1,316 @@
#!/bin/sh
#
# nbest-rover --
# Combine multiple nbest lists ROVER-style
#
# usage: nbest-rover SENTIDS CONTROL-FILE [POSTERIORS]
#
# where SENTIDS is a list of sentence ids (filenames of nbest lists)
# if SENTIDS is "-" the list is inferred from the contents of
# the first N-best directory
# CONTROL-FILE describes the nbest list sets to be processed
#	POSTERIORS is an optional file to which word posterior probabilities
# are written.
#
# The format for CONTROL-FILE is
#
# DIR1 LMW1 WTW1 W1 [ N1 [ S1 ] ]
# DIR2 LMW2 WTW2 W2 [ N2 [ S2 ] ]
# ...
#
# Each DIRi names a directory in which nbest lists are to be found.
# LMWi and WTWi are the rescoring weights to be used for the corresponding
# directory. Wi is the weight to be given to the posteriors computed from
# the respective list. Ni are optional limits on the number of N-best hyps used.
# Si are optional posterior scaling parameters.
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-rover,v 1.43 2019/02/28 04:48:21 stolcke Exp $
#
if [ $# -lt 2 ]; then
echo "usage: $0 [ sentid-list | - ] control-file [posteriors [nbest-lattice-options]]" >&2
exit 2
fi
sentids=$1
control=$2
shift; shift
# for new-style gnu sort
_POSIX2_VERSION=199209
export _POSIX2_VERSION
amw=1
default_lmw=8
default_wtw=0
default_scale=0
default_max_nbest=0
default_weight=1
mesh_option=-use-mesh
if [ $# -gt 0 ]; then
posteriors=$1
shift
else
posteriors=/dev/null
fi
lattice_dir=
posteriors_dir=
nbest_dir=
ref_posteriors=
filter_script=cat
missing_nbest=
use_nbest_scripts=
debug_level=0
null_nbest=${TMPDIR-/tmp}/$$null.nbest
# collect remaining options (mostly to pass them to nbest-lattice)
while [ $# -gt 0 ]; do
case "$1" in
-debug) debug_level=$2
shift; shift ;;
-amw) amw=$2;
shift; shift ;;
-write-dir) lattice_dir=$2
options="$options $1 $2"
shift; shift ;;
-write-nbest-dir)
nbest_dir=$2
options="$options $1 $2"
shift; shift ;;
-write-nbest-posteriors)
posteriors_dir=$2;
shift; shift ;;
-write-ref-posteriors)
ref_posteriors=$2;
options="$options -record-hyps"
shift; shift ;;
-no-mesh) mesh_option= ;
shift ;;
-wer) # -wer implies -no-mesh
mesh_option= ;
options="$options $1"
shift ;;
-missing-nbest)
echo "0 0 0" > $null_nbest
missing_nbest=1
use_nbest_scripts=1
shift ;;
-nbest-backtrace)
# Decipher2 format with backtrace info
# -- need to use old nbest helper scripts
options="$options $1"
use_nbest_scripts=1
shift ;;
-nbest-backtrace-times-only)
# Decipher 2 format - but only timing
# information is needed
helper_options="-nbest-backtrace -decipher-nbest"
options="$options -nbest-backtrace"
shift ;;
-filter) filter_script="$2";
shift; shift ;;
*) options="$options $1"
shift ;;
esac
done
> $posteriors
tmpdir=${TMPDIR-/tmp}
tmp_post=$tmpdir/post$$
tmp_sentids=$tmpdir/sentids$$
tmp_nbest_dir=$tmpdir/nbest.dir$$
tmp_post_dir=$tmpdir/post.dir$$
tmp_lat_dir=$tmpdir/lat.dir$$
trap "rm -rf $tmp_post $tmp_sentids $tmp_nbest_dir $tmp_post_dir $tmp_lat_dir $null_nbest; exit" 0 1 2 15
mkdir -p $tmp_nbest_dir $tmp_post_dir $tmp_lat_dir
#
# make sentid list if none was specified
#
if [ "$sentids" = "-" ]; then
${GAWK-gawk} '{ print $1; exit }' $control | xargs ls | \
sed -e 's,.*/,,' -e 's,\.gz$,,' -e 's,\.score$,,' | \
sort > $tmp_sentids
else
sort +0 -1 $sentids > $tmp_sentids
fi
set -e
#
# create lattice output directory if needed
#
if [ -n "$lattice_dir" ]; then
mkdir -p "$lattice_dir"
elif [ -n "$ref_posteriors" ]; then
lattice_dir=$tmp_lat_dir
options="$options -write-dir $lattice_dir"
fi
if [ -n "$nbest_dir" ]; then
mkdir -p "$nbest_dir"
fi
if [ -n "$posteriors_dir" ]; then
mkdir -p "$posteriors_dir"
elif [ -n "$ref_posteriors" ]; then
posteriors_dir=$tmp_post_dir
fi
cat $tmp_sentids | \
while read sentid refwords
do
extra_weights=
extra_scores=
extra_wts_and_scores=
noheader=0
nbest_tag=1
if [ -n "$posteriors_dir" ]; then
posteriors_file=$posteriors_dir/$sentid
> $posteriors_file
else
posteriors_file=
fi
if [ -n "$use_nbest_scripts" ]; then
# handle DOS EOL, comment and empty lines
sed -e 's,
$,,' -e '/^##/d' -e '/^[ ]*$/d' $control | \
while read dir lmw wtw weight max_nbest scale rest
do
if [ "$wtw" = "+" ]; then
if [ -f $dir/$sentid.gz ]; then
extra_scores="$extra_scores $dir/$sentid.gz"
extra_wts_and_scores="$extra_wts_and_scores $lmw $dir/$sentid.gz"
elif [ -f $dir/$sentid ]; then
extra_scores="$extra_scores $dir/$sentid"
extra_wts_and_scores="$extra_wts_and_scores $lmw $dir/$sentid"
else
echo "$dir/$sentid" is missing >&2
continue
fi
extra_weights="$extra_weights $lmw"
continue
else
if [ -f $dir/$sentid ]; then
nbest_file=$dir/$sentid
elif [ -f $dir/$sentid.gz ]; then
nbest_file=$dir/$sentid.gz
elif [ -f $dir/$sentid.score.gz ]; then
nbest_file=$dir/$sentid.score.gz
elif [ -f $dir/$sentid.score ]; then
nbest_file=$dir/$sentid.score
else
echo -n "$dir/$sentid.score.gz is missing" >&2
extra_weights=
extra_scores=
extra_wts_and_scores=
if [ -n "$missing_nbest" ]; then
echo " - using empty hyp" >&2
nbest_file=$null_nbest
else
echo "" >&2
continue
fi
fi
if [ "$weight" = "=" ]; then
weight=$last_weight
else
last_weight=$weight
fi
if [ -n "$extra_weights" -o "$amw" != 1 ]; then
combine-acoustic-scores \
-v "weights=$amw $extra_weights" \
-v max_nbest=${max_nbest:-$default_max_nbest} \
$nbest_file $extra_scores
else
gzip -dcf $nbest_file
fi | \
nbest-posteriors noheader=$noheader \
lmw=${lmw:-$default_lmw} \
wtw=${wtw:-$default_wtw} \
weight=${weight:-$default_weight} \
max_nbest=${max_nbest:-$default_max_nbest} \
postscale=${scale:-$default_scale} \
nbest_tag=$nbest_tag \
output_posteriors=$posteriors_file
extra_weights=
extra_scores=
extra_wts_and_scores=
noheader=1
nbest_tag=`expr $nbest_tag + 1`
fi
done
else # use helper tool
nbest-rover-helper -debug $debug_level \
-sentid $sentid \
-rover-control $control \
-max-nbest $default_max_nbest \
-rescore-amw $amw \
-rescore-lmw $default_lmw \
-rescore-wtw $default_wtw \
-posterior-weight $default_weight \
-posterior-scale $default_scale \
-write-posteriors "$posteriors_file" \
$helper_options
fi | \
eval "$filter_script" \
> $tmp_nbest_dir/$sentid
if [ -n "$posteriors_file" ]; then
gzip -f $posteriors_file
fi
echo $tmp_nbest_dir/$sentid
done | \
nbest-lattice -nbest-files - \
$mesh_option \
-rescore-lmw 0 -rescore-wtw 0 \
-posterior-amw 0 -posterior-lmw 0 -posterior-wtw 0 \
-debug 2 $options 2>$tmp_post | \
while read sentid hyp
do
# delete tmp nbest lists to avoid huge data accumulation
if [ "$sentid" != "$last_sentid" ]; then
rm -f $tmp_nbest_dir/$sentid
last_sentid=$sentid
fi
echo "$sentid $hyp"
done
if [ -n "$ref_posteriors" ]; then
> $ref_posteriors
cat $tmp_sentids | \
while read sentid refwords
do
if [ -f $lattice_dir/$sentid.gz ]; then
suffix=.gz
else
suffix=
fi
gzip -dcf $lattice_dir/$sentid$suffix | \
find-reference-posteriors sentid=$sentid \
posteriors_file=$posteriors_dir/$sentid$suffix >> $ref_posteriors
done
fi
# extract posteriors to file; output error messages; ignore others
${GAWK-gawk} '$2 == "post" { $2 = ""; print; next; }
$2 == "err" { next; }
{ print > "/dev/stderr"; }' $tmp_post > $posteriors

View File

@@ -0,0 +1,526 @@
/*
* nbest-rover-helper --
* Preprocess nbest lists for nbest-rover
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2010 SRI International, 2017 Andreas Stolcke, Microsoft Corp. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: nbest-rover-helper.cc,v 1.10 2019/09/09 23:13:15 stolcke Exp $";
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <locale.h>
#include <assert.h>
#include <math.h>
#ifndef _MSC_VER
# include <unistd.h>
#endif
#include "option.h"
#include "version.h"
#include "File.h"
#include "Prob.h"
#include "Vocab.h"
#include "NBest.h"
#include "RefList.h"
#include "VocabMultiMap.h"
#include "MultiwordVocab.h" // for MultiwordSeparator
#include "Array.cc"
#include "MStringTokUtil.h"
#define DEBUG_ERRORS 1
#define DEBUG_POSTERIORS 2
/*
* default value for posterior* weights to indicate they haven't been set
*/
static int version = 0;
static unsigned debug = 0;
static char *vocabFile = 0;
static char *vocabAliasFile = 0;
static int toLower = 0;
static int multiwords = 0;
static const char *multiChar = MultiwordSeparator;
static int nbestBacktrace = 0;
static char *rescoreFile = 0;
static char *nbestFiles = 0;
static char *roverControlFile = 0;
static char *sentid = 0;
static char *writeNbestFile = 0;
static char *writeNbestDir = 0;
static int writeDecipherNbest = 0;
static unsigned maxNbest = 0;
static double rescoreAMW = 1.0;
static double rescoreLMW = 8.0;
static double rescoreWTW = 0.0;
static double posteriorScale = 0.0;
static double posteriorWeight = 1.0;
static int noPosteriors = 0;
static char *writePosteriors = 0;
static int nbestTag = 1;
static int optRest;
static Option options[] = {
{ OPT_TRUE, "version", &version, "print version information" },
{ OPT_UINT, "debug", &debug, "debugging level" },
{ OPT_STRING, "vocab", &vocabFile, "vocab file" },
{ OPT_STRING, "vocab-aliases", &vocabAliasFile, "vocab alias file" },
{ OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
{ OPT_TRUE, "multiwords", &multiwords, "split multiwords in N-best hyps" },
{ OPT_STRING, "multi-char", &multiChar, "multiword component delimiter" },
{ OPT_TRUE, "nbest-backtrace", &nbestBacktrace, "read backtrace info from N-best lists" },
{ OPT_STRING, "rescore", &rescoreFile, "hyp stream input file to rescore" },
{ OPT_STRING, "nbest", &rescoreFile, "same as -rescore" },
{ OPT_STRING, "nbest-files", &nbestFiles, "list of n-best filenames" },
{ OPT_STRING, "rover-control", &roverControlFile, "process nbest-rover control file" },
{ OPT_STRING, "sentid", &sentid, "sentence ID string for nbest-rover control file" },
{ OPT_STRING, "write-nbest", &writeNbestFile, "output n-best list" },
{ OPT_STRING, "write-nbest-dir", &writeNbestDir, "output n-best directory" },
{ OPT_TRUE, "decipher-nbest", &writeDecipherNbest, "output Decipher n-best format" },
{ OPT_UINT, "max-nbest", &maxNbest, "maximum number of hyps to consider" },
{ OPT_FLOAT, "rescore-amw", &rescoreAMW, "rescoring AM weight" },
{ OPT_FLOAT, "rescore-lmw", &rescoreLMW, "rescoring LM weight" },
{ OPT_FLOAT, "rescore-wtw", &rescoreWTW, "rescoring word transition weight" },
{ OPT_FLOAT, "posterior-scale", &posteriorScale, "divisor for log posterior estimates" },
{ OPT_FLOAT, "posterior-weight", &posteriorWeight, "overall weight of posterior probabilities" },
{ OPT_TRUE, "no-posteriors", &noPosteriors, "do not compute posterior probabilties (acoustic rescoring only)" },
{ OPT_STRING, "write-posteriors", &writePosteriors, "append posteriors probs to file" },
{ OPT_INT, "nbest-tag", &nbestTag, "subsystem tag number for posterior dump" },
{ OPT_REST, "-", &optRest, "indicate end of option list" },
{ OPT_DOC, 0, 0, "following options, an alternating list of weights and score files/directories" },
};
#ifdef _MSC_VER
# include <errno.h>
# include <sys/stat.h>
/*
* Emulate access(2) in Windows
*/
#define F_OK 0
#define R_OK 4
#define W_OK 2
#define X_OK 1
int
access(const char *path, int mode)
{
struct _stat buf;
if (_stat(path, &buf) < 0) {
return -1;
} else {
if (mode & R_OK && !(buf.st_mode & _S_IREAD)) {
errno = EPERM;
return -1;
}
if (mode & W_OK && !(buf.st_mode & _S_IWRITE)) {
errno = EPERM;
return -1;
}
if (mode & X_OK && !(buf.st_mode & _S_IEXEC)) {
errno = EPERM;
return -1;
}
return 0;
}
}
#endif /* _MSC_VER */
/*
* Read a list of scores from file
*/
Boolean
readScores(const char *filename, unsigned numHyps, unsigned maxN, Array<LogP2> &scores)
{
unsigned numScores = 0;
File file(filename, "r");
char *line;
while ((line = file.getline())) {
LogP2 score;
if (parseLogP(line, score)) {
scores[numScores ++] = score;
} else {
file.position() << "bad score value\n";
return false;
}
if (maxN > 0 && numScores == maxN) break;
}
if (numScores == numHyps || (maxN > 0 && numScores == maxN)) {
return true;
} else {
file.position() << "mismatched number of scores -- expecting "
<< numHyps << endl;
return false;
}
}
/*
* Process a single N-best list
*/
void
processNbest(Vocab &vocab, const char *sentid,
const char *nbestFile, unsigned maxN, Prob weight,
double LMW, double WTW, double postScale,
unsigned nScores, double scoreWeights[], const char *scoreFiles[],
File &outNbestFile, unsigned tag)
{
/*
* Process nbest list
*/
NBestList nbestList(vocab, maxN, multiwords ? multiChar : 0, nbestBacktrace);
nbestList.debugme(debug);
/*
* Posterior scaling: if not specified (= 0.0) use LMW for
* backward compatibility.
*/
if (postScale == 0.0) {
postScale = (LMW == 0.0) ? 1.0 : LMW;
}
if (debug > 0) {
cerr << "PROCESSING " << nbestFile
<< " maxn = " << maxN
<< " weight = " << weight
<< " lmw = " << LMW << " wtw = " << WTW
<< " scale = " << postScale
<< " extras =";
for (unsigned i = 0; i < nScores; i ++) {
cerr << " " << scoreWeights[i]
<< " " << scoreFiles[i];
}
cerr << endl;
}
if (nbestFile) {
File input(nbestFile, "r");
if (!nbestList.read(input)) {
cerr << "format error in nbest list\n";
exit(1);
}
} else {
File input(stdin);
if (!nbestList.read(input)) {
cerr << "format error in nbest list\n";
exit(1);
}
}
/*
* Apply AM weight
*/
if (rescoreAMW != 1.0) {
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
nbestList.getHyp(i).acousticScore *= rescoreAMW;
}
}
/*
* Add extra scores into AM score
*/
for (unsigned j = 0; j < nScores; j ++) {
if (scoreWeights[j] != 0.0) {
Array<LogP2> extraScores;
if (!readScores(scoreFiles[j], nbestList.numHyps(), maxN, extraScores)) {
exit(1);
}
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
nbestList.getHyp(i).acousticScore += scoreWeights[j] * extraScores[i];
}
}
}
if (!noPosteriors) {
/*
* compute log posteriors
*/
nbestList.computePosteriors(LMW, WTW, postScale, 1.0, true);
LogP logWeight = ProbToLogP(weight);
File posteriorFile;
if (writePosteriors && *writePosteriors) {
posteriorFile.reopen(writePosteriors, "a");
}
/*
* Encode log posteriors as acoustic scores, for output purposes
* Also, dump posterior to a separate file if requested
*/
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
nbestList.getHyp(i).acousticScore = nbestList.getHyp(i).posterior;
nbestList.getHyp(i).languageScore = 0.0;
nbestList.getHyp(i).totalScore = nbestList.getHyp(i).acousticScore;
if (writePosteriors && *writePosteriors) {
/* from nbest-posteriors.gawk:
* print nbest_tag, i, unweighted_logpost >> output_posteriors;
*/
posteriorFile.fprintf("%d %d %.*lg\n", tag, i+1,
Prob_Precision, (double)nbestList.getHyp(i).posterior);
}
nbestList.getHyp(i).acousticScore += logWeight;
}
}
nbestList.write(outNbestFile, writeDecipherNbest);
}
int
main (int argc, char *argv[])
{
setlocale(LC_CTYPE, "");
setlocale(LC_COLLATE, "");
argc = Opt_Parse(argc, argv, options, Opt_Number(options),
OPT_OPTIONS_FIRST);
/*
* Ensure arguments are in pairs (weight, scorefile)
*/
if ((argc-1) % 2 == 1) {
cerr << "number of arguments is not even (alternating weights and score files)\n";
exit(2);
}
unsigned nExtraScores = (argc-1)/2;
makeArray(double, scoreWeights, nExtraScores);
makeArray(const char *, scoreFiles, nExtraScores);
for (unsigned i = 0; i < nExtraScores; i ++) {
if (sscanf(argv[2*i + 1], "%lf", &scoreWeights[i]) != 1) {
cerr << "bad score weight " << argv[2*i + 1] << endl;
exit(2);
}
scoreFiles[i] = argv[2*i + 2];
}
if (version) {
printVersion(RcsId);
exit(0);
}
Vocab vocab;
vocab.toLower() = toLower ? true : false;
if (vocabFile) {
File file(vocabFile, "r");
vocab.read(file);
}
if (vocabAliasFile) {
File file(vocabAliasFile, "r");
vocab.readAliases(file);
}
File outFile(stdout);
/*
* Process single nbest file
*/
if (rescoreFile) {
if (writeNbestFile) {
outFile.reopen(writeNbestFile, "w");
}
processNbest(vocab, 0, rescoreFile, maxNbest, posteriorWeight,
rescoreLMW, rescoreWTW, posteriorScale,
nExtraScores, scoreWeights, scoreFiles,
outFile, nbestTag);
if (writeNbestFile) {
outFile.close();
}
}
/*
* Process list of nbest filenames
*/
if (nbestFiles) {
File file(nbestFiles, "r");
char *line;
while ((line = file.getline())) {
char *strtok_ptr = NULL;
char *fname = MStringTokUtil::strtok_r(line, wordSeparators, &strtok_ptr);
if (!fname) continue;
RefString sentid = idFromFilename(fname);
/*
* Construct score file names from directory path and sentid
*/
makeArray(char *, scoreFileNames, nExtraScores);
for (unsigned i = 0; i < nExtraScores; i ++) {
scoreFileNames[i] = new char[strlen(scoreFiles[i]) + 1 + strlen(sentid) + strlen(GZIP_SUFFIX) + 1];
sprintf(scoreFileNames[i], "%s/%s%s", scoreFiles[i], sentid,
GZIP_SUFFIX);
}
/*
* Construct output file names from directory path and sentid
*/
makeArray(char, writeNbestName,
(writeNbestDir ? strlen(writeNbestDir) : 0) + 1
+ strlen(sentid) + strlen(GZIP_SUFFIX) + 1);
if (writeNbestDir) {
sprintf(writeNbestName, "%s/%s%s", writeNbestDir, sentid, GZIP_SUFFIX);
outFile.reopen(writeNbestName, "r");
}
processNbest(vocab, sentid, fname, maxNbest, posteriorWeight,
rescoreLMW, rescoreWTW, posteriorScale,
nExtraScores, scoreWeights, (const char **)(char **)scoreFileNames,
outFile, nbestTag);
if (writeNbestDir) {
outFile.close();
}
for (unsigned i = 0; i < nExtraScores; i ++) {
delete [] scoreFileNames[i];
}
}
}
/*
* Process rover control file
*/
if (roverControlFile) {
if (!sentid) {
cerr << "no -sentid specified with rover control file\n";
exit(2);
}
File roverControl(roverControlFile, "r");
if (writeNbestFile) {
outFile.reopen(writeNbestFile, "w");
}
Array<char *> extraScores;
Array<double> extraWeights;
unsigned nExtraScores = 0;
Prob lastWeight = 1.0;
const char *scoreSuffix = ".score";
char *line;
while ((line = roverControl.getline())) {
char scoreDir[256], plus[10];
double lmw = rescoreLMW, wtw = rescoreWTW, postScale = posteriorScale;
unsigned maxN = maxNbest;
Prob weight = posteriorWeight;
char weightStr[30];
unsigned nparsed;
/*
* nbest-rover:
* read dir lmw wtw weight max_nbest scale rest
*/
if (sscanf(line, "%255s %lf %9s", scoreDir, &lmw, plus) == 3 && strcmp(plus, "+") == 0) {
extraScores[nExtraScores] = new char[strlen(scoreDir) + 1 + strlen(sentid) + strlen(GZIP_SUFFIX) + 1];
sprintf(extraScores[nExtraScores], "%s/%s%s", scoreDir, sentid, GZIP_SUFFIX);
if (access(extraScores[nExtraScores], R_OK) < 0) {
sprintf(extraScores[nExtraScores], "%s/%s", scoreDir, sentid);
if (access(extraScores[nExtraScores], R_OK) < 0) {
roverControl.position() << "no score file for sentid " << sentid << endl;
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
nExtraScores = 0;
continue;
}
}
extraWeights[nExtraScores] = lmw;
nExtraScores ++;
} else if ((nparsed = sscanf(line, "%255s %lf %lf %29s %u %lf", scoreDir, &lmw, &wtw, weightStr, &maxN, &postScale)) >= 1) {
char *nbestFile = new char[strlen(scoreDir) + 1 + strlen(sentid) + strlen(scoreSuffix) + strlen(GZIP_SUFFIX) + 1];
sprintf(nbestFile, "%s/%s%s", scoreDir, sentid, GZIP_SUFFIX);
if (access(nbestFile, R_OK) < 0) {
sprintf(nbestFile, "%s/%s", scoreDir, sentid);
if (access(nbestFile, R_OK) < 0) {
sprintf(nbestFile, "%s/%s%s%s", scoreDir, sentid, scoreSuffix, GZIP_SUFFIX);
if (access(nbestFile, R_OK) < 0) {
sprintf(nbestFile, "%s/%s%s", scoreDir, sentid, scoreSuffix);
if (access(nbestFile, R_OK) < 0) {
roverControl.position() << "no nbest file for sentid " << sentid << endl;
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
nExtraScores = 0;
delete [] nbestFile;
continue;
}
}
}
}
if (nparsed >= 4 && strcmp(weightStr, "=") == 0) {
weight = lastWeight;
} else {
if (!parseProb(weightStr, weight)) {
roverControl.position() << "bad weight value " << weightStr << endl;
weight = 0.0;
}
lastWeight = weight;
}
/*
	     * Now combine all the files
*/
processNbest(vocab, sentid, nbestFile, maxN, weight,
lmw, wtw, postScale,
nExtraScores, extraWeights, (const char **)(char **)extraScores,
outFile, nbestTag);
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
nExtraScores = 0;
delete [] nbestFile;
nbestTag ++;
} else {
roverControl.position() << "bad format in control file\n";
exit(1);
}
}
if (writeNbestFile) {
outFile.close();
}
}
exit(0);
}

View File

@@ -0,0 +1,59 @@
#!/usr/local/bin/gawk -f
#
# nbest-vocab --
# extract vocabulary used in nbest lists
#
# usage: nbest-vocab NBEST-FILE ... > VOCAB
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-vocab.gawk,v 1.2 2003/03/18 00:55:07 stolcke Exp $
#
BEGIN {
nbestformat = 0;
}
$1 ~ /^NBestList1\.0/ {
nbestformat = 1;
next;
}
$1 ~ /^NBestList2\.0/ {
nbestformat = 2;
next;
}
NF > 1 {
if (nbestformat == 1) {
# for Decipher nbest format 1 we use the aggregate score only
for (i = 2; i <= NF; i ++) {
is_word[$i] = 1;
}
} else if (nbestformat == 2) {
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
is_word[$i] = 1;
prev_end_time = end_time;
}
}
} else {
for (i = 4; i <= NF; i ++) {
is_word[$i] = 1;
}
}
}
END {
for (word in is_word) {
print word;
}
}

View File

@@ -0,0 +1,55 @@
#!/usr/local/bin/gawk -f
#
# nbest-words --
# extract words only nbest lists
#
# usage: nbest-words NBEST-FILE ...
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-words.gawk,v 1.1 2016/04/29 04:00:08 stolcke Exp $
#
BEGIN {
nbestformat = 0;
}
$1 ~ /^NBestList1\.0/ {
nbestformat = 1;
next;
}
$1 ~ /^NBestList2\.0/ {
nbestformat = 2;
next;
}
NF > 1 {
words = "";
if (nbestformat == 1) {
for (i = 2; i <= NF; i ++) {
words = words " " $i;
}
} else if (nbestformat == 2) {
prev_end_time = -1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
words = words " " $i
prev_end_time = end_time;
}
}
} else {
for (i = 4; i <= NF; i ++) {
words = words " " $i;
}
}
print words;
}

View File

@@ -0,0 +1,37 @@
#!/usr/local/bin/gawk -f
#
# nbest2-to-nbest1 --
# Convert Decipher NBestList2.0 format to NBestList1.0 format
#
# $Header: /home/srilm/CVS/srilm/utils/src/nbest2-to-nbest1.gawk,v 1.4 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
magic1 = "NBestList1.0";
magic2 = "NBestList2.0";
}
NR == 1 {
if ($0 != magic2) {
print "Input not in " magic2 " format" >> "/dev/stderr";
exit 1;
}
print magic1;
next;
}
{
prev_end_time = -1;
line = $1;
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
line = line " " $i;
prev_end_time = end_time;
}
}
print line;
}

View File

@@ -0,0 +1,23 @@
#!/bin/sh
#
# pfsg-from-ngram --
# Convert a bigram or trigram into a Decipher PFSG
#
# This is a wrapper that takes care of
# - eliminating low probability transitions that the recognizer would never use
# - renormalizing the LM
# - converting to PFSG
# - adding pauses between words
#
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-from-ngram,v 1.3 2000/02/04 00:20:32 stolcke Exp $
#
# get LM from first argument, pass rest to ngram
# default LM is stdin
lm=${1--}
test $# -gt 0 && shift
ngram -debug 1 -prune-lowprobs -lm "$lm" "$@" -write-lm - | \
make-ngram-pfsg | \
add-pauses-to-pfsg
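
Example usage (a sketch; the LM and PFSG file names are placeholders):

    # prune low-probability transitions, renormalize, convert to PFSG, add pauses
    pfsg-from-ngram bigram.lm > bigram.pfsg

    # the LM can also come from stdin (the default argument is "-")
    gzip -dcf bigram.lm.gz | pfsg-from-ngram > bigram.pfsg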

View File

@@ -0,0 +1,87 @@
#!/usr/local/bin/gawk -f
#
# pfsg-to-dot --
# Generate dot(1) graph description from PFSG
#
# usage: pfsg-to-dot [show_probs=1] [show_nums=1] file.pfsg > file.dot
#
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-to-dot.gawk,v 1.5 2003/07/10 21:09:15 stolcke Exp $
#
BEGIN {
show_probs = 0;
show_logs = 0;
show_nums = 0;
in_a_pfsg = 0;
logscale = 10000.5;
}
function bytelog2prob(p) {
x = p / logscale;
if (x < -7e2) {
return 0;
} else {
return exp(x);
}
}
function bytelog2log10(p) {
return p / logscale / 2.30258509299404568402;
}
$1 == "name" {
name = $2;
# handle repeated PFSGs in the same file
if (in_a_pfsg)
print "} digraph \"" name "\" {";
else
print "digraph \"" name "\" {";
print "rankdir = LR";
dotrans = 0;
in_a_pfsg = 1;
}
function node_label(w, i) {
if (show_nums) {
return w "\\n" i;
} else {
return w;
}
}
$1 == "nodes" {
numnodes = $2;
for (i = 0; i < numnodes; i ++) {
print "\tnode" i " [label=\"" $(i + 3) \
(show_nums ? "\\n" i : "") "\"];"
}
}
$1 == "initial" {
i = $2;
# print "\tnode" i " [label=\"START\"];"
}
$1 == "final" {
i = $2;
# print "\tnode" i " [label=\"END\"];"
}
$1 == "transitions" {
dotrans = 1;
next;
}
dotrans && NF == 3 {
from = $1;
to = $2;
prob = $3;
print "\tnode" from " -> node" to \
(!(show_probs || show_logs) ? "" :
" [label=\"" (show_logs ? bytelog2log10(prob) :
bytelog2prob(prob)) "\"]") ";"
}
END {
print "}"
}
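
Example usage (a sketch; the file names are placeholders, and rendering assumes the graphviz dot(1) tool is available):

    pfsg-to-dot show_probs=1 show_nums=1 bigram.pfsg > bigram.dot
    dot -Tps bigram.dot > bigram.ps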

View File

@@ -0,0 +1,146 @@
#!/usr/local/bin/gawk -f
#
# pfsg-to-fsm --
# convert a Decipher PFSG to AT&T FSM format
#
# usage: pfsg-to-fsm [symbolfile=SYMFILE] [symbolic=1] [scale=S] file.pfsg > file.fsm
#
# symbolic=1 retains output word strings in the fsm file.
# symbolfile=SYMFILE dump output symbol table to SYMFILE
# (to be used with fsmcompile|fsmdraw|fsmprint -i SYMFILE)
# scale=S set transition weight scaling factor to S
# (default -1)
#
#
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-to-fsm.gawk,v 1.16 2015-07-03 03:45:38 stolcke Exp $
#
BEGIN {
empty_output = "NULL";
output_symbols[empty_output] = 0;
numoutputs = 1;
if ("TMPDIR" in ENVIRON) {
tmpdir = ENVIRON["TMPDIR"];
} else {
tmpdir = "/tmp"
}
if ("pid" in PROCINFO) {
pid = PROCINFO["pid"];
} else {
getline pid < "/dev/pid";
}
tmpfile = tmpdir "/pfsg.tmp" pid;
# hack to remove tmpfile when killed
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
print "" | trap_cmd;
symbolfile = "";
symbolic = 0;
scale = -1; # scaling of transition weights
nofinal = 0; # do output final node definition
final_output = "";
}
$1 == "nodes" {
numnodes = $2;
for (i = 0; i < numnodes; i++) {
node_output[i] = $(i + 3);
if (!(node_output[i] in output_symbols)) {
output_symbols[node_output[i]] = numoutputs++;
}
}
next;
}
$1 == "initial" {
initial_node = $2;
if (node_output[initial_node] != empty_output) {
print "initial node must be NULL" >> "/dev/stderr";
exit 1;
}
next;
}
$1 == "final" {
final_node = $2;
if (final_output) {
node_output[final_node] = final_output;
if (!(final_output in output_symbols)) {
output_symbols[final_output] = numoutputs++;
}
}
next;
}
function print_trans(from_node, to_node, cost) {
if (to_node == final_node && node_output[final_node] == empty_output) {
print from_node, scale * cost;
} else {
# PFSG bytelogs have to be negated to FSM default semiring
print from_node, to_node, \
(symbolic ? node_output[to_node] : \
output_symbols[node_output[to_node]]), \
scale * cost;
}
}
function print_final() {
# if the final node is non-emitting, we don't need to output it
# at all (see print_trans above)
if (!nofinal && node_output[final_node] != empty_output) {
print final_node, 0;
}
}
$1 == "transitions" {
num_transitions = $2;
# process the transitions and map them to FSM transitions and
# final states.
# FSM requires the first transition to be out of the initial state,
# so we scan the transitions twice.
# The first time, to find the initial transitions, then
# to add all the others. Yuck!
for (k = 1; k <= num_transitions; k ++) {
getline;
from_node = $1;
to_node = $2;
cost = $3;
if (from_node == initial_node) {
print_trans(from_node, to_node, cost);
} else {
print > tmpfile;
}
}
close(tmpfile);
# output definition of the final node
print_final();
# now process all the non-initial transitions
while (getline < tmpfile) {
from_node = $1;
to_node = $2;
cost = $3;
print_trans(from_node, to_node, cost);
}
next;
}
END {
# dump out the symbol table
if (symbolfile) {
for (s in output_symbols) {
print s, output_symbols[s] > symbolfile;
}
}
}
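
Example usage (a sketch; file names are placeholders, and fsmcompile is the AT&T FSM tool referred to in the header comment):

    pfsg-to-fsm symbolfile=bigram.syms bigram.pfsg > bigram.fsm
    fsmcompile -i bigram.syms bigram.fsm > bigram.fsmc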

View File

@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# pfsg-vocab --
# extract vocabulary used in PFSG
#
# usage: pfsg-vocab PFSG-FILE ... > VOCAB
#
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-vocab.gawk,v 1.1 2003/02/18 18:33:04 stolcke Exp $
#
BEGIN {
null = "NULL";
}
$1 == "nodes" {
for (i = 3; i <= NF; i ++) {
if ($i != null) {
is_word[$i] = 1;
}
}
next;
}
$1 == "name" {
# sub-pfsg names are not words, and might have been added during the
# processing of the nodes list
delete is_word[$2];
}
END {
for (word in is_word) {
print word;
}
}

View File

@@ -0,0 +1,55 @@
#!/usr/local/bin/gawk -f
#
# ppl-from-log --
# Recomputes perplexity from (a subset of) the output of
#
# ngram -debug 2 -ppl
#
# This is useful if one wants to analyse predictability of certain
# words/contexts.
#
# usage: ppl-from-log [howmany=<numsents>] ppl-log-file
#
# Copyright (c) 1995, SRI International. All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/ppl-from-log.gawk,v 1.4 2014-07-03 05:57:09 stolcke Exp $
#
function result () {
ppl = exp(-sum/(sentences + words - oovs) * M_LN10);
printf "file %s: %d sentences, %d words, %d oovs\n", \
FILENAME, sentences, words, oovs;
printf "%d zeroprobs, logprob= %f, ppl= %f\n", \
		0, sum, ppl;
}
BEGIN {
M_LN10 = 2.30258509299404568402; # from <math.h>
}
/^ p\( / {
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) {
oovs ++;
} else {
sum += $10;
}
if ($2 == "</s>") {
sentences ++;
} else {
words ++;
}
next;
}
/ ppl= / {
sents ++;
if (howmany > 0 && sents == howmany) {
result();
exit 0;
}
next;
}
{
next;
}
END {
result();
}
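
Example usage (a sketch; the text, LM, and log file names are placeholders):

    # generate a per-word ppl log, then recompute perplexity over its
    # first 100 sentences
    ngram -debug 2 -ppl dev.text -lm bigram.lm > dev.ppl-log
    ppl-from-log howmany=100 dev.ppl-log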

View File

@@ -0,0 +1,34 @@
#!/usr/local/bin/gawk -f
#
# Map words in a text file to zero or more expansions
#
# $Header: /home/srilm/CVS/srilm/utils/src/prettify.gawk,v 1.1 2001/03/24 06:41:31 stolcke Exp $
#
NR == 1 {
# read pretty map file
if (map) {
while ((getline mapline < map) > 0) {
npretty = split(mapline, pretty_list);
word = pretty_list[1];
pretty_map[word] = "";
for (i = 2; i <= npretty; i ++) {
pretty_map[word] = pretty_map[word] " " pretty_list[i];
}
}
}
}
function pretty_up() {
for (i = 1; i <= NF; i ++) {
if ($i in pretty_map) {
$i = pretty_map[$i];
}
if (multiwords) gsub("_", " ", $i);
}
}
{
pretty_up();
print;
}

View File

@@ -0,0 +1,141 @@
#!/usr/local/bin/gawk -f
#
# rank-vocab --
# Given K different rankings of candidate vocabularies, and
# a held-out optimization unigram count file, optimize the
# combined ranking of words
#
# usage: rank-vocab counts words1 words2 ... wordsK
#
# $Header: /home/srilm/CVS/srilm/utils/src/rank-vocab.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
num_sources = 0;
num_output = 0;
num_oovs = 0;
debug = 0;
}
ARGIND == 1 {
word_count[$1] = $2;
num_oovs += $2;
next;
}
ARGIND > 1 {
k = ARGIND - 1;
num_sources = k;
num_words[k] ++;
word_ranked[k, num_words[k]] = $1;
next;
}
function dump_words(k) {
print "source " k " words:";
for (i = 1; i <= num_words[k]; i ++) {
print i, word_ranked[k,i];
}
}
# find the next word from source k that occurs in the test set
# return 0 if no more words are available
function find_next(k) {
for (j = last_chosen[k] + 1; j <= num_words[k]; j ++) {
if (word_count[word_ranked[k,j]] > 0) {
if (debug) {
print "next word rank for source " k ": " j >> "/dev/stderr";
}
return j;
}
}
if (debug) {
print "no more words from source " k >> "/dev/stderr";
}
return 0;
}
# compute gain (number of OOV tokens reduced per number of word types added)
# by adding the next word from source k
function compute_gain(k) {
if (next_word[k] == 0) {
# no more words in source k, no gain
return -1;
} else {
g = word_count[word_ranked[k,next_word[k]]] / (next_word[k] - last_chosen[k]);
if (debug) {
print "next gain for source " k " = " g;
}
return g;
}
}
END {
# for (k = 1; k <= num_sources; k ++) {
# dump_words(k);
# }
for (k = 1; k <= num_sources; k ++) {
last_chosen[k] = 0;
next_word[k] = find_next(k);
gain[k] = compute_gain(k);
}
print "INITIAL OOVS = " num_oovs;
# add words until no more gain possible (i.e., until all source
# words have been used up)
while (1) {
best_gain = -1;
best_source = 0;
# find next best source to pick word from
for (k = 1; k <= num_sources; k ++) {
if (gain[k] > best_gain) {
best_source = k;
best_gain = gain[k];
}
}
if (best_gain < 0) break;
# process all the words from source k up to the one chosen
for (i = last_chosen[best_source] + 1; \
i <= next_word[best_source]; \
i ++) {
word_chosen = word_ranked[best_source,i]
if (debug) {
print "source = " best_source \
" gain = " best_gain \
" word = " word_chosen >> "/dev/stderr";
}
# output the word if it hasn't been already
if (!was_output[word_chosen]) {
num_output ++;
num_oovs -= word_count[word_chosen];
print "RANK " num_output " WORD " word_chosen \
" OOVS " num_oovs;
was_output[word_chosen] = 1;
}
}
# update the statistics for the source that was chosen
last_chosen[best_source] = next_word[best_source];
next_word[best_source] = find_next(best_source);
gain[best_source] = compute_gain(best_source);
}
}
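
Example usage (a sketch; file names are placeholders: the first argument is a held-out unigram count file and the remaining arguments are candidate vocabularies in ranked order):

    rank-vocab heldout.1grams web.vocab news.vocab > vocab.ranking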

View File

@@ -0,0 +1,106 @@
#!/usr/local/bin/gawk -f
#
# remove-lowprob-ngrams --
# Remove ngrams from a backoff LM that have lower prob than their
# backoff paths.
#
# $Header: /home/srilm/CVS/srilm/utils/src/remove-lowprob-ngrams.gawk,v 1.4 2004/11/02 02:00:35 stolcke Exp $
#
NF == 0 {
print;
next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
if (order > 3) {
print "warning: can only handle bigrams and trigrams" >> "/dev/stderr";
}
if (order > maxorder && $2 !~ /=0$/) {
maxorder = order;
}
print;
next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
print;
next;
}
/^\\/ {
print;
next;
}
#
# unigrams
#
currorder == 1 {
word = $2;
uni_prob[word] = $1;
if (NF > 2) {
uni_bow[word] = $3;
}
print;
}
#
# bigrams
#
currorder == 2 {
prob = $1;
word1 = $2;
word2 = $3;
words = $2 " " $3;
if (maxorder > 2) {
bi_prob[words] = prob;
if (NF > 3) {
bi_bow[words] = $4;
}
}
total_bigrams ++;
if (uni_bow[word1] + uni_prob[word2] <= prob) {
print;
} else {
removed_bigrams ++;
}
}
#
# trigrams
#
currorder == 3 {
prob = $1;
word1 = $2;
word2 = $3;
word3 = $4;
if (word2 " " word3 in bi_prob) {
backoff_prob = bi_bow[word1 " " word2] + bi_prob[word2 " " word3];
} else {
backoff_prob = bi_bow[word1 " " word2] + \
uni_bow[word2] + uni_prob[word3];
}
total_trigrams ++;
if (backoff_prob <= prob) {
print;
} else {
removed_trigrams ++;
}
}
END {
if (total_bigrams > 0) {
printf "%d out of %d bigrams removed\n", \
removed_bigrams, total_bigrams >> "/dev/stderr";
}
if (total_trigrams > 0) {
printf "%d out of %d trigrams removed\n", \
removed_trigrams, total_trigrams >> "/dev/stderr";
}
}

View File

@@ -0,0 +1,41 @@
#!/usr/local/bin/gawk -f
#
# replace-unk-words --
# replace OOV words with <unk> tag
#
# usage: replace-unk-words vocab=<vocabfile> text > text-with-unk
#
# $Header: /home/srilm/CVS/srilm/utils/src/replace-unk-words.gawk,v 1.1 2013/12/11 08:32:48 stolcke Exp $
#
BEGIN {
unk = "<unk>";
}
NR == 1 {
if (vocab != "") {
nwords = 0;
while ((getline line < vocab) > 0) {
if (split(line, w, " ") > 0) {
is_word[w[1]] = 1;
nwords += 1;
}
}
close(vocab);
print "read " nwords " words" > "/dev/stderr";
}
is_word[unk] = 1;
is_word["<s>"] = 1;
is_word["</s>"] = 1;
}
{
for (i = 1; i <= NF; i ++) {
if (!($i in is_word)) {
$i = unk;
}
}
print;
}
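
Example usage (a sketch; the vocabulary and text file names are placeholders):

    replace-unk-words vocab=lm.vocab train.txt > train.unk.txt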

View File

@@ -0,0 +1,223 @@
#!/usr/local/bin/gawk -f
#
# replace-words-with-classes --
# replace class expansions with class names
#
# usage: replace-words-with-classes classes=<classfile> text > text-with-classes
#        replace-words-with-classes classes=<classfile> have_counts=1 counts \
# > counts-with-classes
#
# optional arguments:
# outfile=<file> output file for class expansion counts (default: none)
# normalize=<0|1> normalize counts to probabilities (default = 1)
# addone=<count> value to add to counts for probability smoothing (1)
#
# $Header: /home/srilm/CVS/srilm/utils/src/replace-words-with-classes.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $
#
function read_classes(file) {
num_class_defs = 0;
delete num_class_expansions;
delete class_expansions;
delete class_expansion_probs;
while ((getline line < file) > 0) {
n = split(line, a);
if (n == 0) continue;
class = a[1];
num_exp = ++ num_class_expansions[class];
if (a[2] ~ /^[-+0-9.][-+0-9e.]*$/) {
prob = a[2];
i = 3;
} else {
prob = "";
i = 2;
}
expansion = a[i];
for (i++; i <= n; i++) {
expansion = expansion " " a[i];
}
class_expansions[class " " num_exp] = expansion;
if (prob != "") {
class_expansion_probs[class " " num_exp] = prob;
}
num_class_defs ++;
}
print "read " num_class_defs " class expansions" >> "/dev/stderr";
# assign default expansion probs
for (class in num_class_expansions) {
num_exp = num_class_expansions[class];
for (i = 1; i <= num_exp; i ++) {
if (class_expansion_probs[class " " i] == "") {
class_expansion_probs[class " " i] = 1/num_exp;
}
}
}
}
##############################################################################
function add_to_prefix_tree(class, expansion, prob) {
nwords = split(expansion, w);
node = 0;
for (k = 1; k <= nwords; k ++) {
next_node = tree[node " " w[k]];
if (!next_node) {
next_node = ++num_nodes;
tree[node " " w[k]] = next_node;
}
node = next_node;
}
if (!(node in node_class)) {
node_class[node] = class;
node_prob[node] = prob;
}
return node;
}
BEGIN {
normalize = 1;
addone = 1;
partial = 0;
}
NR == 1 {
if (classes) {
read_classes(classes);
close(classes);
} else {
print "no classes file specified" >> "/dev/stderr";
}
for (class in num_class_expansions) {
for (i = 1; i <= num_class_expansions[class]; i ++) {
class_expansion_node[class " " i] = \
add_to_prefix_tree(class, class_expansions[class " " i], \
class_expansion_probs[class " " i]);
}
}
}
{
output = "";
next_pos = 1;
# partial option: multiple spaces block multiword replacement
if (partial) {
gsub("[ ][ ]*[ ]", " | ");
}
#
# handle ngram counts by simply leaving the count value alone
# and doing substitution on the ngram itself.
#
if (have_counts) {
max_pos = NF - 1;
} else {
max_pos = NF;
}
while (next_pos <= max_pos) {
class = "";
prob = 0;
num_exp_words = 0;
# search for largest class expansion starting at current position
node = 0;
k = 0;
while (1) {
node = tree[node " " $(next_pos + k)];
if (node) {
if (node in node_class) {
# we have found a complete expansion, record its class
class = node_class[node];
class_node = node;
prob = node_prob[node];
num_exp_words = k + 1;
}
} else {
break;
}
k ++;
}
if (next_pos == 1) {
space = "";
} else {
space = " ";
}
if (!class) {
output = output space $next_pos;
next_pos ++;
} else {
output = output space class;
next_pos += num_exp_words;
node_count[class_node] ++;
class_count[class] ++;
}
}
# partial option: multiple spaces block multiword replacement
if (partial) {
gsub(" [|] ", " ", output);
sub("^[|]", " ", output);
sub("[|]$", " ", output);
}
if (have_counts) {
print output, $NF;
} else {
print output;
}
}
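# Estimate expansion probabilities with add-"addone" (Laplace) smoothing:
# (count + addone) / (total + N * addone), where N is the number of
# expansions of the class.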
function estimate(count, total, N) {
denom = total + N * addone;
if (denom == 0) {
return 0;
} else {
return (count + addone)/denom;
}
}
END {
if (outfile) {
for (class in num_class_expansions) {
for (i = 1; i <= num_class_expansions[class]; i ++) {
nc = node_count[class_expansion_node[class " " i]] + 0;
print class, \
normalize ? \
estimate(nc, class_count[class], \
num_class_expansions[class]) :
nc, \
class_expansions[class " " i] > outfile;
}
}
close(outfile);
}
}

View File

@@ -0,0 +1,70 @@
#!/bin/sh
#
# rescore-acoustic --
# Replace acoustic Nbest scores with a weighted combination of
# old and new acoustic scores
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-acoustic,v 1.8 2015-07-03 03:45:39 stolcke Exp $
#
if [ $# -lt 5 ]; then
echo "usage: $0 old-nbest-dir old-ac-weight new-score-dir1 new-ac-weight1 new-score-dir2 new-ac-weight2 ... new-nbest-dir [max-nbest]" >&2
echo " or $0 old-file-list old-ac-weight new-score-dir1 new-ac-weight1 new-score-dir2 new-ac-weight2 ... new-nbest-dir [max-nbest]" >&2
exit 1
fi
old_nbest=${1}
old_acw=${2}
shift; shift
new_scores=
new_acw=
while [ $# -ge 3 ]
do
new_scores="$new_scores $1"
new_acw="$new_acw $2"
shift; shift
done
new_nbest=${1}
max_nbest=${2-0}
set -e
tmpdir=${TMPDIR-/tmp}
join1="$tmpdir/join1_$$"
join2="$tmpdir/join2_$$"
trap "rm -f $join1 $join2" 0 1 2 15
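# Strategy: turn each score file path into a "sentid path" pair (basename with
# .gz/.score stripped), sort by sentid, and join the old-score table with the
# table from each new score directory, so every line ends up listing all score
# files for one utterance.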
echo "generating sentids ..." >&2
if [ -d $old_nbest ]; then
find $old_nbest/. -follow -type f -print
else
cat $old_nbest
fi | \
sed -e 's,.*,& &,' -e 's,[^ ]*/,,' -e 's,\.gz , ,' -e 's,\.score , ,' | \
sort -k 1,1 > $join1
echo "`wc -l < $join1` utterances" >&2
for d in $new_scores
do
echo "joining $d ..." >&2
find $d/. -follow -type f -print | \
sed -e 's,.*,& &,' -e 's,[^ ]*/,,' -e 's,\.gz , ,' |\
sort -k 1,1 | \
/usr/local/gnu/bin/join $join1 - > $join2
mv $join2 $join1
done
echo "`wc -l < $join1` utterances after joining" >&2
mkdir -p $new_nbest
cat $join1 | \
while read sentid scorefiles
do
echo $sentid >&2
combine-acoustic-scores -v "weights=$old_acw $new_acw" \
-v max_nbest=$max_nbest $scorefiles | \
gzip > $new_nbest/$sentid.score.gz
done

View File

@@ -0,0 +1,466 @@
#!/bin/sh
#
# rescore-decipher --
# generate scores from Decipher(TM) n-best lists
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-decipher,v 1.40 2017/07/20 05:43:59 stolcke Exp $
#
bytelog=0
nodecipherlm=0
multiwords=0
norescore=0
decipher_lmw=8
decipher_wtw=0
lm_only=0
pretty_file=
filter_command=
limit_vocab=0
vocab_aliases=
fast_rescore=
ngram_tool=ngram
ngram_options=
count_oovs=0
rescore_option=-rescore
multichar=_
tmpdir=${TMPDIR-/tmp}
while [ $# -gt 0 ]
do
case "$1" in
-bytelog)
bytelog=1
;;
-nodecipherlm)
nodecipherlm=1
;;
-multiwords)
multiwords=1
mw_option=-multiwords
smw_option=-split-multiwords
;;
-multi-char)
multichar="$2"; shift
;;
-norescore)
norescore=1
;;
-lm-only)
lm_only=1
;;
-count-oovs)
count_oovs=1
rescore_option="-debug 1 -ppl"
;;
-pretty)
pretty_file="$2"; shift
;;
-ngram-tool)
ngram_tool="$2"; shift
;;
-filter)
filter_command="$2"; shift
;;
-limit-vocab)
limit_vocab=1
;;
-vocab-aliases)
vocab_aliases="$2"; shift
;;
-fast)
fast_rescore=1
;;
-*) echo "$0: unknown option $1" >&2
exit 2 ;;
*) break
;;
esac
shift
done
if [ $# -lt 3 ]; then
{
echo "usage: $0 [-bytelog] [-nodecipherlm] [-multiwords] [-multi-char C] [-norescore] [-lm-only] [-count-oovs] [-pretty map] [-ngram-tool pgm] [-filter command] [-limit-vocab] [-vocab-aliases map] [-fast] nbest-file-list score-dir lm-options ..." >&2
echo "where"
echo " -bytelog produces bytelog scaled scores"
echo " -nodecipherlm avoids Decipher LM score computation"
echo " -multiwords expand multiwords into constituent words"
echo " -multi-char C redefine multiword separator character"
echo " -norescore don't rescore LM, just extract scores"
echo " -lm-only output no N-best lists, only LM scores"
echo " -count-oovs output number of OOV and zeroprob words"
echo " -pretty map word mapping file"
echo " -ngram-tool pgm use pgm for LM evaluation"
echo " -filter command text filter to apply to N-best hyps"
echo " -limit-vocab limit LM loading to used vocabulary"
echo " -vocab-aliases map map vocabulary in LM evaluation"
echo " -fast fast rescoring mode, no text filtering allowed"
} >&2
exit 1
fi
filelist="$1"
scoredir="$2"
shift; shift
if [ ! -d $scoredir ]; then
mkdir $scoredir
fi
# when not rescoring need to get decipher lmw and wtw from remaining options
if [ $norescore -gt 0 ]; then
while [ $# -gt 0 ]
do
case "$1" in
-decipher-lmw)
decipher_lmw=$2
shift
;;
-decipher-wtw)
decipher_wtw=$2
shift
;;
*) shift
;;
esac
done
fi
if [ $norescore -eq 0 -a $limit_vocab -gt 0 ]; then
#
# limit LM vocabulary to words found in the nbest lists
#
nbestvocab="$tmpdir/$$nbest.vocab"
trap "rm -f $nbestvocab; exit" 0 1 2 15
# generate nbest vocabulary
if [ -z "$filter_command" ]; then
nbest-lattice -no-rescore -no-reorder \
$mw_option -multi-char "$multichar" \
-nbest-files "$filelist" -write-vocab $nbestvocab
else
cat "$filelist" | xargs gzip -dcf | \
eval "$filter_command" | \
ngram -rescore - -null -no-reorder \
$smw_option -multi-char "$multichar" \
-write-vocab $nbestvocab >/dev/null
fi
# tell ngram to use this vocab
ngram_options="-limit-vocab -vocab $nbestvocab"
fi
if [ $norescore -eq 0 -a -n "$vocab_aliases" ]; then
if [ $limit_vocab -gt 0 ]; then
nbestvocabalias="$tmpdir/$$nbest.vocabalias"
trap "rm -f $nbestvocab $nbestvocabalias; exit" 0 1 2 15
sort -k 2,2 $vocab_aliases | \
join -1 2 -o 1.1,1.2 - $nbestvocab > $nbestvocabalias
# tell ngram to use these vocab-aliases
ngram_options="$ngram_options -vocab-aliases $nbestvocabalias"
else
# tell ngram to use this vocab-alias
ngram_options="-vocab-aliases $vocab_aliases"
fi
fi
if [ -n "$fast_rescore" ]; then
#
# Fast rescoring mode:
# Hand N-best lists directly to ngram. No text filtering is supported
#
if [ -n "$pretty_file" -o -n "$filter_command" -o $lm_only -gt 0 -o $count_oovs -gt 0 ]
then
echo "Text filtering, -lm-only, and -count-oovs not supported with -fast" >&2
exit 2
fi
if [ $nodecipherlm -eq 0 ]; then
echo "Must use -nodecipherlm with -fast" >&2
exit 2
fi
if [ $norescore -gt 0 ]; then
nbest-lattice -no-rescore -no-reorder $mw_option \
-nbest-files "$filelist" \
-write-nbest-dir "$scoredir"
else
if [ "$multiwords" -gt 0 ]; then
mw_option=-split-multiwords
fi
$ngram_tool \
-no-reorder $mw_option -multi-char "$multichar" \
-nbest-files "$filelist" \
-write-nbest-dir "$scoredir" \
-rescore-lmw 1 -rescore-wtw 1 \
$ngram_options "$@"
fi
else # fast_rescore
#
# General rescoring mode:
# Concatenate hyps for all nbest list, record number of hyps for
# each file in the output stream
# Feed to ngram -rescore (using lm-options)
# or using -ppl for counting OOVs
# Parse ngram output into lm scores and deposit into target files
#
escape="***FILE:"
cat $filelist | ( \
while read filename rest; do
case $filename in
# preserve LMstate labels in the file list and pass them to ngram
"<LMstate>") echo $filename $rest
continue ;;
esac
gzip -dcf $filename | \
${GAWK-gawk} '
BEGIN {
filename = "";
numhyps = 0;
nbestformat = 0;
# constants
bytelogscale = 2.30258509299404568402 * 10000.5 / 1024.0;
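	# bytelog is Decipher's fixed-point log scale: ln(10) * 10000.5 / 1024
	# per log10 unit; bytelog2log10() below divides by this constant to
	# convert bytelog scores back to log10.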
pause = "-pau-";
}
function bytelog2log10(x) {
return x / bytelogscale;
}
NR == 1 {
sentid = filename;
sub("^.*/", "", sentid);
sub("\\.gz$", "", sentid);
sub("\\.Z$", "", sentid);
sub("\\.score$", "", sentid);
sub("\\.wv$", "", sentid);
sub("\\.wav$", "", sentid);
sub("\\.wav_cep$", "", sentid);
# read pretty map file
if (pretty_file) {
while ((getline mapline < pretty_file) > 0) {
npretty = split(mapline, pretty_list);
word = pretty_list[1];
pretty_map[word] = "";
for (i = 2; i <= npretty; i ++) {
pretty_map[word] = pretty_map[word] " " pretty_list[i];
}
}
}
print escape, sentid;
}
function pretty_up(start) {
for (i = start; i <= NF; i ++) {
if ($i in pretty_map) {
$i = pretty_map[$i];
}
if (multiwords) gsub(multichar, " ", $i);
}
}
/^NBestList1\.0/ {
nbestformat = 1;
if (nodecipherlm) {
printf "%s: -nodecipherlm ineffective for NBestList1.0\n", filename > "/dev/stderr" ;
}
next;
}
/^NBestList2\.0/ {
nbestformat = 2;
next;
}
{
numhyps ++;
if (nbestformat == 0) {
pretty_up(4);
if (count_oovs) {
# output only the words, add <s> to handle empty hyps
$1 = $2 = $3 = "";
print "<s>", $0;
} else {
print;
}
} else if (nbestformat == 1) {
pretty_up(2);
if (count_oovs) {
# output only the words, add <s> to handle empty hyps
$1 = "";
print "<s>", $0;
} else if (norescore) {
# convert to SRILM format
score = substr($1,2,length($1)-2);
$1 = "";
print bytelog2log10(score), 0, 0, $0;
} else {
# keep Decipher format
print;
}
} else if (nbestformat == 2) {
score = substr($1,2,length($1)-2);
# compute total AC and LM scores
lm_score = 0;
num_words = 0;
num_pauses = 0;
words = "";
prev_end_time = -1;
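	# NBestList2.0 hyps encode each token as a group of 11 fields; as used
	# below, field i is the token string, i+3 and i+5 are its start and end
	# times, and i+7 is its LM score (bytelog).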
for (i = 2; i <= NF; i += 11) {
start_time = $(i + 3);
end_time = $(i + 5);
# skip tokens that are subsumed by the previous word
# (this eliminates phone and state symbols)
# XXX: due to a bug in Decipher some state tags have incorrect
# timemarks. We filter them based on their token string.
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
words = words " " $i;
num_words ++;
if ($i == pause) num_pauses ++;
lm_score += $(i + 7);
prev_end_time = end_time;
}
}
$0 = $1 " " words;
pretty_up(2);
# Compute AC score from total and lm scores. This takes into
# account that the recognizer might sum scores of equivalent hyps
# (e.g., those differing only in pauses or pronunciations) and
# reflect the summing in the total score, but not in the word AC
# scores.
ac_score = score - lm_score;
if (count_oovs) {
# output only the words, add <s> to handle empty hyps
$1 = "";
print "<s>", $0;
} else if (norescore) {
# convert to SRILM nbest format
# NOTES:
# - subtract Decipher WTW (including for pauses!)
# - compute number of words WITHOUT pauses for output
$1 = "";
print bytelog2log10(ac_score), \
bytelog2log10(lm_score/decipher_lmw) - \
num_words * decipher_wtw, \
split(words, dummy) - num_pauses, $0;
} else if (nodecipherlm) {
# output only acoustic score in Decipher format
$1 = "(" ac_score ")";
print;
} else {
# output combined score in Decipher format
print;
}
}
}
END {
if (numhyps == 0) {
print "WARNING: nbest list " filename " is empty" \
> "/dev/stderr" ;
}
}
' filename=$filename escape="$escape" count_oovs=$count_oovs \
nodecipherlm=$nodecipherlm multiwords=$multiwords \
multichar="$multichar" pretty_file="$pretty_file" \
norescore=$norescore decipher_lmw=$decipher_lmw decipher_wtw=$decipher_wtw
done
) | \
if [ $norescore -gt 0 -a -z "$filter_command" ]; then
# no rescoring and no filtering
cat
elif [ $norescore -gt 0 ]; then
# no rescoring, but filter hyps
eval "$filter_command"
elif [ -z "$filter_command" ]; then
# standard rescoring without filtering
$ngram_tool -debug 1 $rescore_option - -rescore-lmw 1 -rescore-wtw 1 \
-escape "$escape " $ngram_options "$@"
else
# rescoring with filtering
eval "$filter_command" | \
$ngram_tool -debug 1 $rescore_option - -rescore-lmw 1 -rescore-wtw 1 \
-escape "$escape " $ngram_options "$@"
fi | \
${GAWK-gawk} -v bytelog=$bytelog '
BEGIN {
currentfile = "";
scoredir = "";
scorefile = "";
numhyps = 0;
bytelogscale = 2.30258509299404568402 * 10000.5 / 1024.0;
}
$1 == escape {
if (currentfile) {
close(scorefile);
}
currentfile = $2;
sub("
$", "", currentfile);
if (!lm_only && !count_oovs) {
# backward compatibility
currentfile = currentfile ".score";
}
scorefile = "gzip > " scoredir "/" currentfile ".gz";
printf "processing hyps for %s\n", currentfile \
> "/dev/stderr" ;
hypno = 0;
next;
}
# parse ngram -ppl output to get OOV (including zeroprobs) count
count_oovs && $6 == "OOVs" {
num_oovs = $5;
next;
}
count_oovs && $2 == "zeroprobs," {
num_oovs += $1;
print num_oovs | scorefile;
next;
}
# process ngram -rescore output
!count_oovs {
if ($2 ~ /NaN/) {
print "WARNING: LM score in nbest list " currentfile " is NaN" \
> "/dev/stderr" ;
$2 = -100000;
}
if (bytelog) {
$1 = $1 * bytelogscale;
$2 = $2 * bytelogscale;
}
if (lm_only) {
print $2 | scorefile;
} else {
print | scorefile;
}
}
END {
if (currentfile) {
close(scorefile);
}
}
' scoredir=$scoredir escape="$escape" bytelog=$bytelog lm_only=$lm_only count_oovs=$count_oovs

View File

@@ -0,0 +1,43 @@
#!/bin/sh
#
# rescore-minimize-wer --
# minimize posterior expected WER in an nbest-list
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-minimize-wer,v 1.7 2013/03/09 07:13:01 stolcke Exp $
#
if [ $# -lt 1 ]; then
echo "usage: $0: score-dir [lmw [wtw [max-nbest]]]" >&2
exit 1
fi
scoredir="$1"
lmweight="${2-8.0}"
wtweight="${3-0.0}"
maxnbest="${4-10}"
find $scoredir -follow -type f \( -name \*.score -o \
-name \*.score.Z -o \
-name \*.score.gz \) \
-print | sort | \
while read file
do
case $file in
*.Z) cat="gzip -dcf"
sentid=`basename $file .score.Z`
;;
*.gz) cat="gzip -dcf"
sentid=`basename $file .score.gz`
;;
*) cat=cat
sentid=`basename $file .score`
;;
esac
${GAWK-gawk} -v sentid="$sentid" 'BEGIN { printf "%s ", sentid }'
$cat $file | \
sed -e 's,-pau-,,g' -e 's,\[[^]]*\],,g' | \
nbest-lattice -wer -debug 1 -rescore - \
-rescore-lmw $lmweight -rescore-wtw $wtweight \
-max-rescore $maxnbest
done

View File

@@ -0,0 +1,77 @@
#!/bin/sh
#
# rescore-nbest --
# output LM scores for nbest lists
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-nbest,v 1.3 1996/03/28 19:12:01 stolcke Exp $
#
if [ $# -lt 3 ]; then
echo "usage: $0: nbest-file-list score-dir lm-options ..." >&2
exit 1
fi
filelist="$1"
scoredir="$2"
shift; shift
#
# STRATEGY:
# Concatenate hyps for all nbest list, record number of hyps for
# each file in the output stream
# Strip hyp ids, !SENT_START, !SENT_END
# Feed to ngram -ppl (using lm-options)
# Parse ngram output into lm scores and deposit into target files
#
escape="***FILE:"
cat $filelist | ( \
while read filename; do
set -e
numhyps=`wc -l < $filename`
echo "$escape `basename $filename .trans`.score $numhyps"
sed \
-e 's/^ *([^ ]*) //' \
-e 's/!SENT_START //' \
-e 's/!SENT_END //' \
$filename
done
) | \
ngram -debug 1 -ppl - -escape "$escape " "$@" | \
gawk '
BEGIN {
currentfile = "";
scoredir = "";
scorefile = "";
numhyps = 0;
M_LN10 = 2.30258509299404568402; # from <math.h>
}
$1 == escape {
if (currentfile) {
close(scorefile);
}
currentfile = $2;
scorefile = scoredir "/" currentfile;
numhyps = $3;
printf "processing %d hyps for %s\n", numhyps, currentfile;
hypno = 0;
next;
}
/logprob=/ {
logprob = $4;
hypno ++;
# rescale LM scores to natural logs
printf "%g\n", logprob * M_LN10 > scorefile;
next;
}
END {
if (currentfile) {
close(scorefile);
}
}
' scoredir=$scoredir escape="$escape"

View File

@@ -0,0 +1,134 @@
#!/bin/sh
#
# rescore-reweight
# reweight nbest-list scores and select top hyps
#
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-reweight,v 1.20 2013/03/09 07:13:01 stolcke Exp $
#
multiwords=0
multichar=_
while [ $# -gt 0 ]
do
case "$1" in
-multiwords)
multiwords=1
;;
-multi-char)
multichar="$2"
shift
;;
-*) echo "$0: unknown option $1" >&2
exit 2 ;;
*) break
;;
esac
shift
done
if [ $# -lt 1 ]; then
echo "usage: $0 [-multiwords] [-multi-char C] score-dir [lmw [wtw [scoredir weight ...] [max-nbest]]]" >&2
echo " or $0 [-multiwords] [-multi-char C] file-list [lmw [wtw [scoredir weight ...] [max-nbest]]]" >&2
exit 1
fi
scoredir="$1"
shift
lmweight="${1-8.0}"
[ $# -gt 0 ] && shift
wtweight="${1-0.0}"
[ $# -gt 0 ] && shift
extra_scoredirs=
extra_weights=
while [ $# -gt 1 ]; do
extra_scoredirs="$extra_scoredirs $1"
extra_weights="$extra_weights $2"
shift; shift
done
maxnbest="${1-100000}"
# prevent "broken pipe" from $cat below when maxnbest truncates list
trap '' 13
if [ -d $scoredir ]; then
find $scoredir -follow -type f \( -name \*.score -o \
-name \*.score.Z -o \
-name \*.gz \) \
-print | sort
else
cat $scoredir
fi | \
while read file
do
case $file in
*.score.Z) cat="gzip -dcf"
sentid=`basename $file .score.Z`
;;
*.score.gz) cat="gzip -dcf"
sentid=`basename $file .score.gz`
;;
*.score) cat=cat
sentid=`basename $file .score`
;;
*) # use nbest-lattice to convert Decipher nbest format
cat="nbest-lattice -no-rescore -no-reorder -keep-noise -write-nbest - -nbest"
sentid=`basename $file .gz`
;;
esac
if [ -z "$extra_scoredirs" ]; then
$cat $file
else
extra_scores=
for dir in $extra_scoredirs
do
if [ -f $dir/$sentid.gz ]; then
extra_scores="$extra_scores $dir/$sentid.gz"
elif [ -f $dir/$sentid ]; then
extra_scores="$extra_scores $dir/$sentid"
else
echo "$dir/$sentid" is missing >&2
extra_scores="$extra_scores /dev/null"
fi
done
$cat $file | \
combine-acoustic-scores \
-v "weights=1 $extra_weights" \
-v max_nbest=$maxnbest \
- $extra_scores
fi | \
${GAWK-gawk} '
BEGIN {
hypnum = 0;
}
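# SRILM nbest format: field 1 = acoustic score, field 2 = LM score,
# field 3 = word count, remaining fields = the hypothesis words.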
NF >= 3 {
hypnum ++;
if (hypnum > maxnbest) exit 0;
totalscore = $1 + lmweight * $2 + wtweight * $3;
if (!winner || totalscore > maxscore) {
maxscore = totalscore;
winner = $0;
winrank = hypnum;
besthyp = "";
for (i = 4; i <= NF; i++) besthyp = besthyp " " $i;
}
}
END {
# resolve multiwords if requested
if (multiwords) {
gsub(multichar, " ", besthyp);
}
print sentid besthyp;
printf "%s: best hyp is %d\n", sentid, winrank > "/dev/stderr";
}
' sentid="$sentid" lmweight="$lmweight" wtweight="$wtweight" maxnbest="$maxnbest" multiwords=$multiwords multichar="$multichar"
done

View File

@@ -0,0 +1,85 @@
#!/usr/local/bin/gawk -f
#
# reverse-lm --
# reverse N-grams in a backoff LM
#
# usage: reverse-lm lm-file > rev-lm-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-lm.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
start_tag = "<s>";
end_tag = "</s>";
renorm_command = "ngram -debug 1 -order 2 -lm - -renorm -write-lm -";
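	# all output is piped through "ngram -renorm" so that backoff weights
	# are recomputed for the reversed model; the dummy weights written
	# below are only placeholders.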
}
NF==0 {
print | renorm_command;
next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
if (order > 2) {
print "can handle bigram LMs only" >> "/dev/stderr";
exit(2);
}
print | renorm_command;
next;
}
/^\\[0-9]-grams:/ {
currorder=substr($0,2,1);
print | renorm_command;
next;
}
/^\\/ {
print | renorm_command;
next;
}
currorder == 1 {
# unigrams are copied unchanged
# store probs for later use
prob = $1;
word = $2;
if (word == start_tag) {
; # get <s> unigram prob from </s>
} else if (word == end_tag) {
uniprob[start_tag] = uniprob[end_tag] = prob;
} else {
uniprob[word] = prob;
}
# add dummy backoff weight
$3 = "0";
print | renorm_command;
next;
}
function map_tags(w) {
if (w == start_tag) {
return end_tag;
} else if (w == end_tag) {
return start_tag;
} else {
return w;
}
}
currorder == 2 {
# bigrams are reversed and new probabilities are assigned
prob = $1;
w1 = map_tags($2);
w2 = map_tags($3);
# p_rev(w1|w2) = p(w1) p(w2|w1) / p(w2)
new_prob = uniprob[w1] + prob - uniprob[w2];
if (new_prob > 0) {
print "warning: p(" w1 "|" w2 ") > 0" >> "/dev/stderr";
}
print new_prob "\t" w2 " " w1 | renorm_command;
next;
}

View File

@@ -0,0 +1,28 @@
#!/usr/local/bin/gawk -f
#
# reverse-ngram-counts --
# Reverse the word order in N-gram count files
#
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-ngram-counts.gawk,v 1.2 2017/07/31 18:18:50 stolcke Exp $
#
BEGIN {
start_tag = "<s>";
end_tag = "</s>";
}
{
i = 1;
j = NF - 1;
while (i < j) {
h = $i;
$i = $j;
$j = h;
i ++; j--;
}
# swap <s> and </s> tags
for (i = 1; i < NF; i ++) {
if ($i == end_tag) $i = start_tag;
else if ($i == start_tag) $i = end_tag;
}
print;
}

View File

@@ -0,0 +1,32 @@
#!/usr/local/bin/gawk -f
#
# reverse-text --
# Reverse the word order in a text file
#
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-text.gawk,v 1.1 2003/01/01 18:35:23 stolcke Exp $
#
BEGIN {
start_tag = "<s>";
end_tag = "</s>";
}
{
if ($1 == start_tag) {
i = 2;
} else {
i = 1;
}
if ($NF == end_tag) {
j = NF - 1;
} else {
j = NF;
}
while (i < j) {
h = $i;
$i = $j;
$j = h;
i ++; j--;
}
print;
}

View File

@@ -0,0 +1,176 @@
#!/bin/sh
#
# rexport --
# retrying export with customs, via gnumake
#
# $Header: /home/srilm/CVS/srilm/utils/src/rexport.gnumake,v 1.2 2011/07/21 19:48:19 stolcke Exp $
#
usage() {
echo "usage: $0 [-m] [-J numjobs] [-delay D] [-check-exec] [-f] [-debug] [-same] [-exclusive] [-exit-on-error] [-uselocal] [-attr value] ... command [args ...]" >&2
}
# allow as many file descriptors as possible for pmake
# (this command may fail in old versions of sh -- we ignore that)
ulimit -n `ulimit -H -n 2>/dev/null` >/dev/null 2>&1
set -e
jobs=1
makemode=0
delay=
check_exec=0
exit_on_error=0
#
# parse options
#
attributes=
while [ $# -gt 0 ]; do
case "$1" in
-m) makemode=1
shift ;;
-same) attributes="$attributes SAME"
shift ;;
-exclusive)
attributes="$attributes EXCLUSIVE"
shift ;;
-uselocal)
attributes="$attributes USELOCAL"
shift ;;
-attr) attributes="$attributes $2"
shift; shift;;
-debug) debug=1
shift ;;
-f) readfiles=1;
shift ;;
-J) jobs="$2"
shift; shift ;;
-delay) delay="$2"
shift; shift ;;
-check-exec)
check_exec=1
shift ;;
-exit-on-error)
exit_on_error=1
shift ;;
-*) usage
exit 2 ;;
*)
break ;;
esac
done
#
# parse command
#
# find tmp file that doesn't exist yet
for suffix in a b c d e f g h i j k l m n o p q r s t u v x y z
do
mkfile=/tmp/export$$$suffix
if [ ! -f $mkfile ]; then
break
fi
done
trap "rm -f $mkfile; exit 1" 1 2 15
#
# create makefile
#
if [ "$#" -eq 0 -o "$readfiles" ]; then
# read commands from files or stdin
cat "$@"
else
# use what's on the command line
echo "$@"
fi | \
gawk '
BEGIN {
ld_lib_path_var = "LD_LIBRARY_PATH";
}
NR == 1 {
# always use /bin/sh for portability across platforms
print "SHELL=/bin/sh"
print ".cleanup: ; @/bin/rm -f " mkfile
jobnum = 0;
}
NF > 0 {
jobnum ++;
job = ".job" jobnum;
alljobs = alljobs job " ";
# make sure shell variable expansion is preserved
gsub("\\$", "$$");
delay = delay + 0;
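	# -check-exec: wrap the command so it waits (polling every 5 seconds)
	# until the first absolute-path word in the command exists and is
	# executable before running it.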
if (check_exec) {
exec_file = "";
for (i = 1; i <= NF; i ++) {
if ( $i ~ "^/") {
exec_file = $i;
break;
}
}
if (exec_file) {
sub("[;&|].*", "", exec_file);
$0 = "while [ ! -x " exec_file " ]; do sleep 5; done; " $0
}
}
if (ld_lib_path_var in ENVIRON) {
$0 = ld_lib_path_var "=" ENVIRON[ld_lib_path_var] "; export " ld_lib_path_var "; " $0
}
if (njobs > 1) {
if (delay > 0 && jobnum > 1) {
prev_delay_target = delay_target;
delay_target = "delay" jobnum;
print delay_target ": " prev_delay_target \
"; @sleep " delay;
} else {
delay_target = "";
}
print job ": " delay_target "; " $0;
} else {
print job ": ; @" $0;
}
if (makemode) {
print "\t@touch " job;
}
}
END {
print "all: " alljobs;
print alljobs ": .cleanup";
if (jobnum == 0) {
print "warning: empty command list" > "/dev/stderr";
}
}
' makemode=$makemode attributes="$attributes" mkfile="$mkfile" \
njobs="$jobs" delay="$delay" check_exec=$check_exec \
exit_on_error=$exit_on_error > $mkfile
if [ "$debug" ]; then
cat $mkfile
rm -f $mkfile
exit
fi
# avoid illegal values when make is invoked from other makes
MAKEFLAGS=
MFLAGS=
export MAKEFLAGS MFLAGS
if [ $exit_on_error = 0 ]; then
ignoreflag=-k
fi
exec make -j $jobs $ignoreflag -f $mkfile all

View File

@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# rover-control-tying --
# extract tying information from rover-control file for use with
# compute-best-rover-mix tying=...
#
BEGIN {
bin = 0;
}
/^##/ || /^[ ]*$/ {
# skip comment or empty line
next;
}
$3 == "+" {
next;
}
{
if ($4 == "") $4 = 1;
if ($4 == "=") {
output = output " " bin;
} else {
output = output " " ++bin;
}
}
END {
sub("^ ", "", output);
print output;
}

View File

@@ -0,0 +1,65 @@
#!/usr/local/bin/gawk -f
#
# rover-control-weights --
# retrieve or change weights in rover-control file
#
# usage:
# retrieving
# rover-control-weights rover-control
# changing:
# rover-control-weights weights="..." rover-control > new-rover-control
#
# $Header: /home/srilm/CVS/srilm/utils/src/rover-control-weights.gawk,v 1.3 2017/08/16 06:34:16 stolcke Exp $
#
NR == 1 {
if (weights) {
nweights = split(weights, w);
}
output_weights = "";
}
/^##/ || /^[ ]*$/ {
# pass through comment or empty line
print;
next;
}
$3 == "+" {
if (weights) {
print;
}
next;
}
{
# dir lmw wtw weight max_nbest scale
if (weights) {
# fill in missing parameter values
if (NF < 2) $2 = 8;
if (NF < 3) $3 = 0;
if (++ sysno <= nweights) {
if ($4 == "=" && w[sysno] == w[sysno-1]) {
# preserve weight tying if new weights are compatible
;
} else {
$4 = w[sysno];
}
} else {
$4 = 1;
}
print;
} else {
if (NF < 4) $4 = 1;
output_weights = output_weights " " $4;
}
}
END {
if (!weights) {
sub("^ ", "", output_weights);
print output_weights;
}
}

View File

@@ -0,0 +1,267 @@
#!/bin/sh
#
# search-rover-combo --
# search for best rover combination from a list of systems
#
# $Header: /home/srilm/CVS/srilm/utils/src/search-rover-combo,v 1.14 2016-12-10 18:20:33 stolcke Exp $
#
scriptdir=`dirname $0`
score_script=$scriptdir/score-hyps
datadir=SEARCH-DATA
weights="1"
smooth_weight=
sentids=-
njobs=1
refs=
# collect options
while [ $# -gt 0 ]; do
case "$1" in
-rover) shift
run_rover=1
break ;;
-rover-optimize) shift
run_rover_optimize=1
break ;;
-scorer) score_script="$2";
shift; shift ;;
-weights) weights="$2";
shift; shift ;;
-smooth-weight)
smooth_weight="$2";
shift; shift ;;
-smooth-control)
smooth_control="$2";
shift; shift ;;
-datadir) datadir="$2";
shift; shift ;;
-sentids) sentids="$2";
shift; shift ;;
-refs) refs="$2"
shift; shift ;;
-J) njobs=$2
shift; shift ;;
-*) echo "usage: $0 [-scorer SCRIPT] [-weights=\"W1 W2 ...\" | -refs REFS] [-smooth-weight S] [-datadir DIR] [-sentids LIST] LIST-OF-CONTROL-FILES" >&2
exit 2 ;;
*) break ;;
esac
done
# see if this is a recursive evaluation to run a single nbest-rover
if [ -n "$run_rover" ]; then
# sentids control-file hyps-out
nbest-rover $1 $2 > $3
exit
elif [ -n "$run_rover_optimize" ]; then
# sentids control-file hyps-out refs
nbest-rover $1 $2 /dev/null > $3-0 2>&1 \
-refs $4 -write-ref-posteriors $3.ref-posteriors
rm $3-0
tying=`rover-control-tying $2`
compute-best-rover-mix tying="$tying" $3.ref-posteriors > $3.optimize 2>&1
weights=`${GAWK-gawk} '/best lambda/ { sub(".*[(]", "", $0); sub("[)]", "", $0); print }' $3.optimize `
rover-control-weights weights="$weights" $2 > $2.optimized1
if [ -n "$smooth_weight" -a -n "$smooth_control" ]; then
combine-rover-controls keeppaths=1 lambda=$smooth_weight $smooth_control $2.optimized1 > $2.optimized
else
mv $2.optimized1 $2.optimized
fi
nbest-rover $1 $2.optimized > $3
exit
fi
rexport=${REXPORT-rexport.gnumake -exit-on-error -J $njobs -f}
input_list=${1-SYSTEM-LIST}
# backward compatibility for 2nd argument
score_script=${2-$score_script}
# backward compatibility for 3rd argument
datadir=${3-$datadir}
set -e
mkdir -p $datadir
#
# Step 1: compute errors for individual systems
#
system_errors=$datadir/system-errors
cmdlist=$datadir/score.rexports
tmpctrl=$datadir/tmp.control
tmphyps=$datadir/tmp.hyps
tmpscore=$datadir/tmp.score
sort $input_list > $datadir/sorted_inputs
iter=0
iterdir=$datadir/$iter
mkdir -p $iterdir
system_errors=$iterdir/system_errors
if [ ! -s $system_errors ]; then
count=1
> $cmdlist
cat $datadir/sorted_inputs | \
while read roverctrl
do
# rewrite rover control file to adjust directory paths
combine-rover-controls $roverctrl > $tmpctrl.$count
echo "$0 -rover $sentids $tmpctrl.$count $tmphyps.$count; \
echo $roverctrl \`$score_script $tmphyps.$count\` > $tmpscore.$count" >> $cmdlist
count=`expr $count + 1`
done
# run the scoring jobs
if [ $njobs -lt 2 ]; then
sh -ex $cmdlist >$cmdlist.log 2>&1
else
$rexport $cmdlist >$cmdlist.log 2>&1
fi
sort +0 -1 $tmpscore.* > $system_errors
rm -f $tmpctrl.* $tmphyps.* $tmpscore.*
fi # system_errors exists
best_system=`sort +1n -2 $system_errors | ${GAWK-gawk} '{ print $1; exit }' `
best_error=`sort +1n -2 $system_errors | ${GAWK-gawk} '{ print $2; exit }' `
echo "FIRST SYSTEM" >&2
echo $best_system >&2
echo "ERROR $best_error" >&2
echo "$best_system 1" > $iterdir/combo
join -v 1 $datadir/sorted_inputs $iterdir/combo > $iterdir/unused
cat $best_system > $iterdir/rover.control
tryall=yes
# if weight estimation is used, we always add the new system at a fixed lower weight
# than the sum of prior systems
if [ -n "$refs" ]; then
weights=0.5
fi
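# Greedy forward search: starting from the single best system, each iteration
# tries adding every unused system (at each candidate weight) to the current
# rover combination and keeps the addition that reduces the error the most;
# if nothing improves, previously discarded systems are put back in the
# running once ("EXPANDING SEARCH") before the search stops.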
while [ -s $iterdir/unused ]
do
newiter=`expr $iter + 1`
newiterdir=$datadir/$newiter
mkdir -p $newiterdir
echo "ITER $newiter" >&2
system_errors=$newiterdir/system_errors
if [ ! -s $system_errors ]; then
for weight in $weights
do
count=1
> $cmdlist
cat $iterdir/unused | \
while read roverctrl
do
combine-rover-controls keeppaths=1 lambda="1 $weight" $iterdir/rover.control $roverctrl > $tmpctrl.$count
if [ -n "$refs" ]; then
# evaluate rover control file with weight optimization
if [ -n "$smooth_weight" ]; then
smooth="-smooth-weight $smooth_weight -smooth-control $iterdir/rover.control"
fi
echo "$0 $smooth -rover-optimize $sentids $tmpctrl.$count $tmphyps.$count $refs; \
echo $roverctrl $weight \`$score_script $tmphyps.$count\` $tmpctrl.$count.optimized > $tmpscore.$count" >> $cmdlist
else
# evaluate rover control file without weight optimization
echo "$0 -rover $sentids $tmpctrl.$count $tmphyps.$count; \
echo $roverctrl $weight \`$score_script $tmphyps.$count\` $tmpctrl.$count > $tmpscore.$count" >> $cmdlist
fi
count=`expr $count + 1`
done
# run the scoring jobs
if [ $njobs -lt 2 ]; then
sh -ex $cmdlist >$cmdlist.log 2>&1
else
$rexport $cmdlist >$cmdlist.log 2>&1
fi
sort +0 -1 $tmpscore.* > $system_errors
${GAWK-gawk} -v old_error=$best_error '$3 < old_error' $system_errors > $system_errors.improved
if [ -s $system_errors.improved ]; then
# we found at least one improvement; stop trying weights
break;
fi
done
else
# restart search at this iteration
${GAWK-gawk} -v old_error=$best_error '$3 < old_error' $system_errors > $system_errors.improved
fi
if [ -s $system_errors.improved ]; then
best_system=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $1; exit }' `
best_weight=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $2; exit }' `
best_error=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $3; exit }' `
best_control=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $4; exit }' `
echo "NEXT SYSTEM" >&2
echo "$best_system $best_weight" >&2
echo "ERROR $best_error" >&2
if [ ! -s $newiterdir/rover.control ]; then
cat $best_control > $newiterdir/rover.control
fi
{ cat $iterdir/combo; echo "$best_system $best_weight"; } | sort +0 -1 > $newiterdir/combo
${GAWK-gawk} '{ print $1 }' $system_errors.improved | \
join -v 1 - $newiterdir/combo > $newiterdir/unused
tryall=yes
else
cat $iterdir/combo > $newiterdir/combo
cat $iterdir/rover.control > $newiterdir/rover.control
fi
rm -f $tmpctrl.* $tmphyps.* $tmpscore.*
if [ ! -s $newiterdir/unused -a "$tryall" ]; then
# no improvement -- add all previously discarded systems back into the running
echo "EXPANDING SEARCH" >&2
if [ ! -f $newiterdir/combo ]; then
# try extending the same combo again in next iteration
cat $iterdir/combo > $newiterdir/combo
cat $iterdir/rover.control > $newiterdir/rover.control
fi
join -v 1 $datadir/sorted_inputs $newiterdir/combo > $newiterdir/unused
# do this only once until we can add a new system
tryall=
fi
iter=$newiter
iterdir=$newiterdir
done
echo "BEST COMBO" >&2
cat $iterdir/combo >&2
echo "ERROR $best_error" >&2
cat $iterdir/rover.control

View File

@@ -0,0 +1,551 @@
#!/usr/bin/perl
#
# Usage: select-vocab [-quiet] -heldout file f1 f2 ... fn
#
# Selects a vocabulary from the union of the vocabularies of f1
# through fn that maximizes the likelihood of the heldout file. f1
# through fn can either be text files, count files or ARPA-style
# back-off language models. If they are text files, further,
# each line in them can optionally be prefixed by a sentence id, which
# will be stripped if the file has the .sentid extension.
#
# Note: This implementation corrects an error in the paper [1]. The
# EM procedure specification in [1] describes corpus level interpolation.
# But we use word-level interpolation.
#
# Authors: Anand Venkataraman and Wen Wang
# STAR Lab, SRI International, Menlo Park, CA 94025, USA.
#
# $Header: /home/srilm/CVS/srilm/utils/src/select-vocab.pl,v 1.7 2013/04/05 16:50:56 stolcke Exp $
#
# Globals
my $Quiet = 0; # Quiet or Verbose?
my $Gzip = 0; # Do we have Gzip?
MAIN: {
my $heldOut = ""; # Filename of the heldout corpus
my $maxIter = 500; # Perform a maximum of this many EM iters
my $precision = 1e-5; # Stop EM iterations when log likelihood changes less than this much
my $scale = 1e6; # Scale final output counts by this much
while ($arg = shift(@ARGV)) {
if ($arg =~ /^-h(elp)?$/) {
usage();
} elsif ($arg =~ /^-held(-)?(out)?$/) {
$heldOut = shift(@ARGV);
} elsif ($arg =~ /^-scale(-)?(counts)?$/) {
$scale = shift(@ARGV);
} elsif ($arg =~ /^-q(uiet)?$/) {
$Quiet = 1;
} elsif ($arg =~ /^-/) {
print STDERR "Unknown option: $arg\n";
usage();
} else {
unshift(@ARGV, $arg);
last;
}
}
die "$0: I need a held out corpus (-heldout) to maximize likelihood.\n" if ($heldOut eq "");
die "$0: I need at least two corpora to combine vocabulary counts.\n" if ($#ARGV < 1);
# Determine whether gzip exists in the path
#
if (system("sh -c 'gzip -help' >/dev/null 2>&1") == 0) {
message("I found gzip in your path. So I'll support compressed input.\n");
$Gzip=1;
} else {
message("I didn't find gzip in your path. So I won't support compressed input.\n");
$Gzip=0;
}
# Make held-out counts and calculate total number of tokens.
#
my $heldOut_counts_ref = make_raw_counts($heldOut);
my $numWords = 0;
foreach my $word (keys %{$heldOut_counts_ref}) {
$numWords += $heldOut_counts_ref->{$word};
}
die "$0: The held-out corpus must not be empty.\n" if ($numWords == 0);
# The grand vocab is a union of all possible words, including in the Heldout set.
#
my $vocab = make_full_vocab($heldOut, @ARGV);
# Create log distributions for each of the (n > 1) corpora. The counts
# will all use a common vocabulary that is the union of the individual
# vocabularies. Use Witten-Bell discounting to handle zero-frequency
# items in the normalization process.
#
for (my $n = 0; $n <= $#ARGV; $n++) {
$lambda[$n] = 1/($#ARGV+1);
$logprobs_refs[$n] = estimate_logprobs($ARGV[$n], $vocab);
}
message("Iter 0: lambdas = (@lambda)\n");
# Now perform EM. Iterate to increase the likelihood of the heldout set.
# Procedure halts when the likelihood changes by less than $precision
# after an iteration. See Eqns. (3)-(6) of Venkataraman & Wang, 2003.
#
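# Word-level EM update (implemented in the per-word loop below):
#   posterior_n(w) = lambda_n * p_n(w) / sum_m lambda_m * p_m(w)
#   lambda'_n      = sum_w c_heldout(w) * posterior_n(w) / numWords
# computed in log space via logsum() to avoid underflow.
#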
$done = 0;
$iter = 0;
while (!$done && $iter < $maxIter) {
$done = 1;
$iter++;
my $loglike = 0;
@post_totals = ();
# Calculate log lambdas.
#
for (my $n = 0; $n <= $#ARGV; $n++) {
$log_lambda[$n] = log($lambda[$n]);
}
# Estimate lambdas per word and average over all words.
#
foreach my $word (keys %{$heldOut_counts_ref}) {
undef $log_numer_sum;
for (my $n = 0; $n <= $#ARGV; $n++) {
$log_numer[$n] = $log_lambda[$n] + $logprobs_refs[$n]->{$word};
$log_numer_sum = logsum($log_numer_sum, $log_numer[$n]);
}
$loglike += $log_numer_sum * $heldOut_counts_ref->{$word};
for (my $n = 0; $n <= $#ARGV; $n++) {
$post_totals[$n] += exp($log_numer[$n] - $log_numer_sum) * $heldOut_counts_ref->{$word};
}
}
for (my $n = 0; $n <= $#ARGV; $n++) {
$lambda_prime[$n] = $post_totals[$n]/$numWords;
$delta[$n] = abs($lambda_prime[$n] - $lambda[$n]);
$done = 0 if ($delta[$n] > $precision);
}
@lambda = @lambda_prime;
next if $Quiet;
for (my $n = 0; $n <= $#lambda_prime; $n++) {
$lambda_trunc[$n] = sprintf("%0.6f", $lambda[$n]);
}
my $ppl_trunc = sprintf("%.4f", exp(-$loglike/$numWords));
my $loglike_trunc = sprintf("%.4f", $loglike);
message("Iter $iter: lambdas = (@lambda_trunc) log P(held-out) = $loglike_trunc PPL = $ppl_trunc\n");
}
# Compute the combined counts.
#
message("Combining counts.\n");
undef %counts;
foreach my $word (keys %{$vocab}) {
for (my $n = 0; $n <= $#ARGV; $n++) {
$counts{$word} += $lambda[$n] * exp($logprobs_refs[$n]->{$word});
}
}
# Print out the final vocab with the combined counts scaled by $scale.
#
foreach my $word (keys %counts) {
my $score = $counts{$word} * $scale;
print "$word\t $score\n";
}
exit(0);
}
#----------------------------------------------------------------------
# Return a ref to a hash of normalized counts. Use the given vocabulary
# and Witten-Bell (1991) smoothing to ensure non-zero probabilities.
#
sub estimate_logprobs {
my($f, $voc_ref) = @_;
message("Estimating logprobs for $f. ");
my $counts_ref = make_raw_counts($f);
my $sumcounts = 0;
foreach my $word (keys %{$counts_ref}) {
$sumcounts += $counts_ref->{$word};
}
# Compute the number of "novel" words. i.e. words in vocab, but
# not in counts.
#
my $vocabsize = scalar keys %{$voc_ref};
my $nwords = scalar keys %{$counts_ref};
my $num_novel = $vocabsize - $nwords;
message("It has all but $num_novel vocabulary words.\n");
# If there are no novel words, just normalize and return;
#
if (!$num_novel) {
foreach my $word (keys %{$counts_ref}) {
$counts_ref->{$word} = log($counts_ref->{$word}) - log($sumcounts);
}
return $counts_ref;
}
# Create keys for novel words.
#
foreach my $word (keys %{$voc_ref}) {
$counts_ref->{$word} += 0;
}
# If the sum of the counts is less than one, we probably got them from a
# language model that already smoothed the unigram counts. So we use the left over
# mass for novel words. Otherwise, if the sum is equal to 1, we rescale the
# probabilities by 0.9 (until a better way can be found), and use the remaining
# mass to distribute. If the counts are > 1, then we perform smoothing ourselves.
#
if ($sumcounts < 1) {
my $novel_mass = 1-$sumcounts;
message("\tSum of counts in $f is only $sumcounts\n");
message("\tWill distribute probabilty mass of $novel_mass over novel words\n");
my $novel_logprob = log(1-$sumcounts) - log($num_novel);
foreach my $word (keys %{$counts_ref}) {
if ($counts_ref->{$word}) {
$counts_ref->{$word} = log($counts_ref->{$word});
} else {
$counts_ref->{$word} = $novel_logprob;
}
}
return $counts_ref;
}
if ($sumcounts == 1) {
message("\tSum of counts in $f is exactly 1\n");
message("\tWill scale them by 0.9 and use 0.1 for novel words.\n");
my $novel_logprob = log(0.1/$num_novel);
foreach my $word (keys %{$counts_ref}) {
if ($counts_ref->{$word}) {
$counts_ref->{$word} = log($counts_ref->{$word} * 0.9);
} else {
$counts_ref->{$word} = $novel_logprob;
}
}
return $counts_ref;
}
# Normalize and smooth. Note that in calculating the probability of novel words,
# the Witten-Bell estimate for the novel event is $nwords/($sum_counts+$nwords).
# This mass is shared equally by each of the novel words and hence $num_novel in
# the denominator.
#
foreach my $word (keys %{$counts_ref}) {
if ($counts_ref->{$word}) {
$counts_ref->{$word} = log($counts_ref->{$word}/($sumcounts + $nwords));
} else {
$counts_ref->{$word} = log($nwords) - log($sumcounts + $nwords) - log($num_novel);
}
}
return $counts_ref;
}
#---------------------------------------------------------------------------
# The following subroutines construct the vocabulary from various kinds
# of input files.
#
sub make_full_vocab {
my @files = @_;
my %voc;
foreach my $f (@files) {
$ftype = getftype($f);
if ($ftype eq "text") {
message("Adding words from text file $f into vocabulary.\n");
add_vocab_from_text(\%voc, $f);
} elsif ($ftype eq "sentid") {
message("Adding words from sentID file $f into vocabulary.\n");
add_vocab_from_sentid(\%voc, $f);
} elsif ($ftype eq "counts") {
message("Adding words from counts file $f into vocabulary.\n");
add_vocab_from_counts(\%voc, $f);
} elsif ($ftype eq "arpa-lm") {
message("Adding words from ARPA-style LM file $f into vocabulary.\n");
add_vocab_from_lm(\%voc, $f);
} else {
die "I don't know the file type for $f. Giving up.\n";
}
}
return \%voc;
}
sub add_vocab_from_text {
my($voc_ref, $f) = @_;
my $in = zopen($f);
while (my $line = <$in>) {
my @words = split(/\s+/, $line);
foreach my $word (@words) {
$voc_ref->{$word} = 0;
}
}
close($in);
}
# Same as above, but gets rid of sentid (first word on each line)
#
sub add_vocab_from_sentid {
my($voc_ref, $f) = @_;
my $in = zopen($f);
while (my $line = <$in>) {
my @words = split(/\s+/, $line);
shift(@words); # Toss sentid
foreach my $word (@words) {
$voc_ref->{$word} = 0;
}
}
close($in);
}
# Same as above, but only uses the first word of each line. Each line
# in a count file will have two fields -- word count
#
sub add_vocab_from_counts {
my($voc_ref, $f) = @_;
my $in = zopen($f);
while (my $line = <$in>) {
my @fields = split(/\s+/, $line);
next if $line =~ /^\s*$/ || $#fields > 1; # Ignore non-unigram counts
next if $fields[0] =~ /<.*>/; # Skip pseudo words.
$voc_ref->{$fields[0]} = 0;
}
close($in);
}
# Same as above, but only takes probabilities from the unigram
# portion of the arpa-format lm.
#
sub add_vocab_from_lm {
my($voc_ref, $f) = @_;
my $in = zopen($f);
# Locate unigram section
while (my $line = <$in>) {
last if $line =~ /^\\1-grams:/;
}
# Read unigrams into vocab
while (my $line = <$in>) {
last if $line =~ /^\\2-grams:/;
my ($logprob, $word, @rest) = split(/\s+/, $line);
next if $word =~ /(^\s*$)|(<.*>)/; # Skip pseudo words.
$voc_ref->{$word} = 0;
}
close($in);
}
#----------------------------------------------------------------------
# The following subroutines are very similar to the ones above.
# They return a ref to a hash of unnormalized counts from various kinds
# of input files.
#
sub make_raw_counts {
my($f) = @_;
$ftype = getftype($f);
if ($ftype eq "text") {
return make_raw_counts_from_text($f);
} elsif ($ftype eq "sentid") {
return make_raw_counts_from_sentid($f);
} elsif ($ftype eq "counts") {
return make_raw_counts_from_counts($f);
} elsif ($ftype eq "arpa-lm") {
return make_raw_counts_from_lm($f);
} else {
die "I don't know the file type for $f. Giving up.\n";
}
}
sub make_raw_counts_from_text {
my($f) = @_;
my %counts;
my $in = zopen($f);
while (my $line = <$in>) {
my @words = split(/\s+/, $line);
foreach my $word (@words) {
$counts{$word}++;
}
}
close($in);
return \%counts;
}
sub make_raw_counts_from_sentid {
my($f) = @_;
my %counts;
my $in = zopen($f);
while (my $line = <$in>) {
my @words = split(/\s+/, $line);
shift (@words); # Toss sentid
foreach my $word (@words) {
$counts{$word}++;
}
}
close($in);
return \%counts;
}
sub make_raw_counts_from_counts {
my($f) = @_;
my %counts;
my $in = zopen($f);
while (my $line = <$in>) {
my @fields = split(/\s+/, $line);
next if $line =~ /^\s*$/ || $#fields > 1; # Ignore non-unigram counts.
next if $fields[0] =~ /<.*>/; # Skip pseudo words.
$counts{$fields[0]} += $fields[1];
}
close($in);
return \%counts;
}
# Well, the counts from the lm aren't going to be raw. We just have to
# settle for the normalized counts.
#
sub make_raw_counts_from_lm {
my($f) = @_;
my %counts;
my $in = zopen($f);
# Locate unigram section
while (my $line = <$in>) {
last if $line =~ /^\\1-grams:/;
}
# Read in unigram counts
while (my $line = <$in>) {
last if $line =~ /^\\2-grams:/;
my ($logprob, $word) = split(/\s+/, $line);
next if $word =~ /(^\s*$)|(<.*>)/; # Skip pseudo words.
$counts{$word} += 10**$logprob;
}
close($in);
return \%counts;
}
#---------------------------------------------------------------------------
sub getftype {
my($f) = @_;
# First check if it is a sentid file. If necessary insert further checks
# by looking into the file.
#
return "sentid" if ($f =~ /\.sentid(\.gz|\.Z)?$/);
# Extract the first five lines from the file to make our decision.
#
my $in = zopen($f);
for (my $i = 0; $i < 5; $i++) {
$lines[$i] = <$in> || last;
}
close($in);
# Is it a count file? Assume it is and try to falsify from the
# first 5 lines. Format should be -- word count \n
#
my $countfile = 1;
for (my $i = 0; $i < 5; $i++) {
my @words = split(/\s+/, $lines[$i]);
if ($words[$#words] !~ /\d+/) {
$countfile = 0;
last;
}
}
return "counts" if ($countfile == 1);
# Is it an arpa-style language model?
#
my $s = join(' ', @lines);
return "arpa-lm" if ($s =~ /\s*\\data\\\s*ngram\s+1\s*=/);
# Otherwise, assume it is a text file.
#
return "text";
}
# Given log(x) and log(y), this function returns log(x+y).
#
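# Implemented as max(x,y) + log(1 + exp(min - max)) so the exp() never overflows.
#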
sub logsum {
my($x,$y) = @_;
my $z;
if (!defined($x)) {
$z = $y;
} elsif (!defined($y)) {
$z = $x;
} else {
$z = ($x < $y)? logsum($y,$x) : $x + log(1+exp($y-$x));
}
return $z;
}
sub message {
my($msg) = @_;
return if ($Quiet);
print STDERR "$msg";
}
# Opens a possibly compressed file. Only uncomment the gzip line
# if gzip is available. Otherwise, compressed files aren't supported.
#
sub zopen {
my($f) = @_;
local *IN;
die "$f is not a file.\n" if ! -f $f;
if (!$Gzip) {
open(IN, $f) || die "$f: $!\n";
} else {
open(IN, "gzip -dfc $f |") || die "gzip -dfc $f: $!\n";
}
return *IN;
}
sub usage {
print STDERR <<" .;";
Usage:
$0 [-quiet] [-scale n] -heldout corp_h corp1 corp2 ...
Estimate weighted and combined counts for the words in the vocabulary.
The weights maximize the likelihood of the heldout corpus, corp_h, under a
mixture of Witten-Bell smoothed unigram language models estimated from corp1
through corpn.
-quiet stops debug style messages while running.
-scale n causes final combined counts to be scaled by n.
.;
exit 1;
}
#---------------------------------------------------------------------------------
# References.
#
# 1. Venkataraman, A. and W. Wang, (2003). "Techniques for effective vocabulary
# selection", in Proceedings of Eurospeech'03, Geneva, 2003.
#
# 2. Witten, I. H. and T. C. Bell, (1991). "The zero-frequency problem:
# Estimating the probabilities of novel events in adaptive text compression",
# IEEE Trans. IT, 37, pp. 1085-1091.

View File

@@ -0,0 +1,145 @@
#!/usr/local/bin/gawk -f
#
# sentid-to-ctm --
# Format a sentid transcript file into CTM format, faking time marks
# by spacing words evenly across the duration of the segment
#
# Note: this script makes assumptions about the structure of sentence
# ID, specifically, how they encode speakers and timemarks.
#
# $Header: /home/srilm/CVS/srilm/utils/src/sentid-to-ctm.gawk,v 1.11 2019/02/09 07:31:37 stolcke Exp $
#
BEGIN {
# time to leave at edges of segments
delta = 0.07;
pause = "-pau-";
reject = "@reject@";
sort_cmd = "sort -b -k 1,1 -k 2,2 -k 3,3n";
}
# read confidences and/or segment information if given
NR == 1 {
if (confidences) {
while ((getline line < confidences) > 0) {
nvalues = split(line, a);
if (nvalues > 0) {
conf_lines[a[1]] = line;
}
}
}
if (segments) {
while ((getline line < segments) > 0) {
nvalues = split(line, a);
if (nvalues == 5) {
sentid = a[1];
segment_conv[sentid] = a[2];
segment_channel[sentid] = a[3];
segment_start[sentid] = a[4];
segment_end[sentid] = a[5];
}
}
close(segments);
}
}
function is_nonspeech(w) {
return w == pause || w == reject || w ~/^\[.*\]$/ || w ~/^<.*>$/;
}
{
orig_sentid = sentid = $1;
# strip speaker diacritics
sub("_s[1-9]$", "", sentid);
if (segments && sentid in segment_start) {
conv = segment_conv[sentid];
channel = segment_channel[sentid];
start_offset = segment_start[sentid];
end_offset = segment_end[sentid];
# derive channel and time information from sentids
# look for a pattern that encodes channel and
# start/end times
} else if (match(sentid, "_[0-9]_[-0-9][0-9]*_[0-9][0-9]*$")) {
# waveforms with [012] channel id, timemarks 1/1000s
# NOTE: this form is used by the segmenter
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = sentid_parts[2] / 1000;
end_offset = sentid_parts[3] / 1000;
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*$")) {
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = sentid_parts[2] / 100;
end_offset = sentid_parts[3] / 100;
# new sentids used by Ramana for SPINE segmentations
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*_[-0-9][0-9]*_[0-9][0-9]*$")) {
conv = substr(sentid, 1, RSTART-1);
split(substr(sentid, RSTART+1), sentid_parts, "_");
channel = sentid_parts[1];
start_offset = (sentid_parts[2]+sentid_parts[4]) / 100;
end_offset = (sentid_parts[2]+sentid_parts[5]) / 100;
} else {
print "cannot parse sentid " sentid >> "/dev/stderr";
conv = sentid;
channel = "?";
start_offset = 0;
end_offset = 10000;
}
$1 = "";
$0 = $0;
numwords = NF;
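	# fake time marks: spread the words evenly across the segment,
	# leaving "delta" seconds of padding at each edge.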
if (numwords > 0) {
word_dur = (end_offset - start_offset - 2 * delta)/numwords;
} else {
word_dur = 0;
}
# find confidence values for this sentid
if (confidences) {
if (!(orig_sentid in conf_lines)) {
print "no confidences for " orig_sentid >> "/dev/stderr";
} else {
delete conf_values;
n_conf_values = \
split(conf_lines[orig_sentid], conf_values);
}
}
for (i = 1; i <= numwords; i ++) {
if (is_nonspeech($i)) continue;
start_time = start_offset + delta + (i - 1) * word_dur;
if (i + 1 in conf_values) {
conf_value = conf_values[i + 1];
} else {
conf_value = 0;
}
# split multiwords
ncomps = split($i, word_comps, "_");
for (j = 1; j <= ncomps; j ++) {
print conv, channel, \
start_time + (j - 1) * word_dur/ncomps,\
word_dur/ncomps, \
toupper(word_comps[j]), \
conf_value | sort_cmd;
}
}
if (orig_sentid in conf_lines && numwords != n_conf_values - 1) {
print "mismatched number of confidences for " orig_sentid \
>> "/dev/stderr";
}
}

View File

@@ -0,0 +1,60 @@
#!/usr/local/bin/gawk -f
#
# sentid-to-sclite --
# convert sentid transcription format to sclite 'trn' format
#
# $Header: /home/srilm/CVS/srilm/utils/src/sentid-to-sclite.gawk,v 1.5 2016/09/23 20:05:51 stolcke Exp $
#
# i.e.:
# sentid word1 word2 ....
#
# becomes
#
# word1 word2 ... (sentid)
#
# The sentid is formatted to contain exactly one underscore,
# as sclite uses the first portion of the id as a speaker label to
# group results.
#
BEGIN {
format_sentids = 1;
}
{
sentid = $1;
$1 = "";
if (format_sentids) {
# reformat sentid
# <conv>_<channel>_<utterance> -> <conv><channel>_<utterance>
sub("[-_]A", "A", sentid);
sub("[-_]B", "B", sentid);
sub("[-_]ch1", "ch1", sentid);
sub("[-_]ch2", "ch2", sentid);
# remove underscore after corpus tag, if any
if (sentid ~ /^[a-z][a-z]*[-_][0-9]/) {
sub("[-_]", "", sentid);
}
# <conv>_<channel>_<utterance> -> <conv><channel>_<utterance>
sub("[-_]A", "A", sentid);
sub("[-_]B", "B", sentid);
sub("[-_]ch1", "ch1", sentid);
sub("[-_]ch2", "ch2", sentid);
# work around problems with negative start times in sentids
sub("_-", "_m", sentid);
#
# for sentid not containing _ or -, fake a speaker id out of the first
# three characters (this works for ATIS ...)
#
if (! (sentid ~ /[-_]/)) {
sentid = substr(sentid, 1, 3) "_" sentid;
}
}
print $0, "(" sentid ")";
}

View File

@@ -0,0 +1,56 @@
#!/usr/local/bin/gawk -f
#
# sort-lm --
# sort the ngrams in an LM in lexicographic order, as required for
# some other LM software (notably CMU's).
#
# usage: sort-lm lm-file > sorted-lm-file
#
# $Header: /home/srilm/CVS/srilm/utils/src/sort-lm.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
sorter = "";
currorder = 0;
}
NF==0 {
print;
next;
}
/^ngram *[0-9][0-9]*=/ {
order = substr($2,1,index($2,"=")-1);
print;
next;
}
/^\\[0-9]-grams:/ {
if (sorter) {
close(sorter);
}
currorder = substr($0,2,1);
print;
fflush();
# set up new sorting pipeline;
sorter = "sort";
for (i = 1; i <= currorder; i ++) {
sorter = sorter " +" i " -" (i+1);
}
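	# old-style sort keys: "+i -(i+1)" is the 0-based syntax for a key on
	# field i+1, so this sorts on the N-gram words while skipping the
	# log probability in field 1.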
# print sorter >> "/dev/stderr";
next;
}
/^\\/ {
if (sorter) {
close(sorter);
sorter = "";
}
currorder = 0;
print; next;
}
currorder && NF > 1 {
print | sorter;
next;
}
{
print;
}

View File

@@ -0,0 +1,57 @@
#!/usr/local/bin/gawk -f
#
# split-tagged-ngrams --
# multiply tagged-word ngrams out into ngrams that contain
# combinations of words and tags
#
# sample input:
# a/A b/B 10
# sample output:
# a b 10
# a B 10
# A b 10
# A B 10
#
# $Header: /home/srilm/CVS/srilm/utils/src/split-tagged-ngrams.gawk,v 1.2 2006/02/11 01:31:32 stolcke Exp $
#
BEGIN {
separator = "/";
}
# recursive expansion of the tagged-word ngram
function expand_ngram(ng, n, suffix, c,
word, tag, word_tag) {
if (n == 0) {
print suffix, c;
} else {
last_item = ng[n];
if (split(last_item, word_tag, separator) == 2) {
word = word_tag[1];
tag = word_tag[2];
expand_ngram(ng, n-1, word " " suffix, c);
expand_ngram(ng, n-1, tag " " suffix, c);
} else {
expand_ngram(ng, n-1, last_item " " suffix, c);
}
}
}
NF > 1 {
count = $NF;
delete ngram;
for (i = 1; i < NF; i ++) {
ngram[i] = $i;
}
expand_ngram(ngram, NF - 1, "", count);
next;
}
{
print;
}

View File

@@ -0,0 +1,42 @@
#!/usr/local/bin/gawk -f
#
# subset-context-ngrams --
# Extract counts corresponding to ngram contexts
#
# usage: subset-context-ngrams contexts=FILE COUNTS > SUBSET
#
# $Header: /home/srilm/CVS/srilm/utils/src/subset-context-ngrams.gawk,v 1.1 2008/09/30 03:54:05 stolcke Exp $
#
# read contexts
NR == 1 {
saveline = $0;
if (contexts != "") {
howmany = 0;
while ((getline < contexts) > 0) {
if (NF < 2) continue;
$NF = "";
subset_contexts[$0 FS] = 1;
howmany ++;
}
print "read " howmany " contexts" > "/dev/stderr";
}
$0 = saveline;
}
NF == 2 {
print;
next;
}
NF > 2 {
saveline = $0;
$NF = $(NF-1) = "";
if ($0 in subset_contexts) {
print saveline;
}
}

View File

@@ -0,0 +1,44 @@
#!/usr/local/bin/gawk -f
#
# subtract-ppls --
# Subtracts text statistics (from -ppl output)
#
# The first input file contains a total, from which subsequent stats are
# discounted. The result is printed in a format compatible with -ppl.
#
# Copyright (c) 1995, SRI International. All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/subtract-ppls.gawk,v 1.2 1997/07/12 05:01:08 stolcke Exp $
#
/^file .*: .* sentences/ {
if (ARGIND == 1) {
totalsents = $3;
totalwords = $5;
totaloovs = $7;
} else {
totalsents -= $3;
totalwords -= $5;
totaloovs -= $7;
}
getline;
if (ARGIND == 1) {
zeroprobs = $1;
totalprob = $4;
} else {
zeroprobs -= $1;
totalprob -= $4;
}
}
END {
M_LN10 = 2.30258509299404568402; # from <math.h>
ppl = exp (- M_LN10 * totalprob / \
(totalwords - totaloovs - zeroprobs + totalsents));
printf "file TOTAL: %d sentences, %d words, %d OOVs\n", \
totalsents, totalwords, totaloovs;
printf "%d zeroprobs, logprob= %g ppl= %g\n", \
zeroprobs, totalprob, ppl;
}

View File

@@ -0,0 +1,13 @@
#!/usr/local/bin/gawk -f
#
# tolower-ngram-counts --
# Map N-gram counts to lowercase
#
# $Header: /home/srilm/CVS/srilm/utils/src/tolower-ngram-counts.gawk,v 1.1 2007/07/13 23:38:22 stolcke Exp $
#
{
for (i = 1; i < NF; i ++) {
$i = tolower($i);
}
print;
}

View File

@@ -0,0 +1,65 @@
#!/usr/local/bin/gawk -f
#
# uniform-classes --
# Assign uniform membership probabilities to word class expansions
# that don't already have probabilities
#
# usage: uniform-classes CLASSFILE > UNIFORM-CLASSFILE
#
# $Header: /home/srilm/CVS/srilm/utils/src/uniform-classes.gawk,v 1.3 2016/05/13 23:00:35 stolcke Exp $
#
BEGIN {
num_class_defs = 0;
}
{
line = $0;
n = split(line, a);
if (n == 0) next;
class = a[1];
num_exp = ++ num_class_expansions[class];
if (a[2] ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
prob = a[2];
i = 3;
} else {
prob = "";
i = 2;
}
expansion = a[i];
for (i++; i <= n; i++) {
expansion = expansion " " a[i];
}
class_expansions[class " " num_exp] = expansion;
if (prob != "") {
class_expansion_probs[class " " num_exp] = prob;
}
num_class_defs ++;
}
END {
print "read " num_class_defs " class expansions" >> "/dev/stderr";
# assign default expansion probs
for (class in num_class_expansions) {
num_exp = num_class_expansions[class];
for (i = 1; i <= num_exp; i ++) {
prob = class_expansion_probs[class " " i];
if (prob == "") {
prob = 1/num_exp;
}
print class, prob, class_expansions[class " " i];
}
}
}

View File

@@ -0,0 +1,36 @@
#!/usr/local/bin/gawk -f
#
# uniq-ngram-counts --
# Collapse identical successive N-grams in counts file
#
# $Header: /home/srilm/CVS/srilm/utils/src/uniq-ngram-counts.gawk,v 1.2 2007/07/13 23:50:28 stolcke Exp $
#
{
if (NF == 1) {
ngram = " ";
} else {
ngram = "";
}
for (i = 1; i < NF; i ++) {
ngram = ngram " " $i;
}
# starting ngrams with space character forces string comparison
if (ngram != last_ngram) {
if (last_ngram != "") {
# avoid outputting initial space
print substr(last_ngram, 2), total_count;
}
total_count = 0;
last_ngram = ngram;
}
total_count += $NF;
}
END {
if (last_ngram != "") {
print substr(last_ngram, 2), total_count;
}
}

View File

@@ -0,0 +1,79 @@
#!/usr/local/bin/gawk -f
#
# vp2text --
#	Convert the ARPA CSR vp (verbalized punctuation) format to plain
# text for LM training.
#
# This combines the functionality of Roni Rosenfeld's "vp2svp1" and
# "sgml2text" utilities (except for case mapping). No <s> and </s>
# tags are retained, since our LM software doesn't need them.
#
# $Header: /home/srilm/CVS/srilm/utils/src/vp2text.gawk,v 1.2 1996/09/17 21:59:57 stolcke Exp $
#
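# Illustrative input line (leftover whitespace aside):
#	THE PRICE ROSE THREE %PERCENT ,COMMA ANALYSTS SAID .PERIOD
# becomes
#	THE PRICE ROSE THREE percent ANALYSTS SAID
# while SGML markup lines such as <art.100> or <DOC ...> are dropped entirely.
#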
BEGIN {
iquote = 0;
nquote = 5;
}
# Reset the quote counter at article boundaries
/^<art\./ {
iquote = 0;
}
/^<DOC/ {
iquote = 0;
}
#
# Filter out SGML tags
#
/^</ {
next;
}
#
# Do all the easy replacements
{
# These are pronounced
gsub("@AT-SIGN", "at");
gsub("&AMPERSAND", "and");
gsub("\\+PLUS", "plus");
gsub("=EQUALS", "equals");
gsub("%PERCENT", "percent");
gsub("/SLASH", "slash");
gsub("\\.POINT", "point");
# These aren't
gsub(",COMMA", "");
gsub("\\?QUESTION-MARK", "");
gsub(":COLON", "");
gsub("\#SHARP-SIGN", "");
gsub("'SINGLE-QUOTE", "");
gsub(";SEMI-COLON", "");
gsub("!EXCLAMATION-POINT", "");
gsub("{LEFT-BRACE", "");
gsub("}RIGHT-BRACE", "");
gsub("\\(LEFT-PAREN", "");
gsub("\\)RIGHT-PAREN", "");
gsub("\\.PERIOD", "");
gsub("\\.\\.\\.ELLIPSIS", "");
gsub("--DASH", "");
gsub("-HYPHEN", "");
}
# Handle lines containing "DOUBLE-QUOTE as a special case since this
# is more costly: replace every nquote'th occurrence with "quote", else
# delete it.
/"DOUBLE-QUOTE/ {
output = "";
for (i = 1; i <= NF; i++) {
if ($i == "\"DOUBLE-QUOTE") {
if ((iquote++) % nquote == 0) {
output = output " quote";
}
} else {
output = output " " $i;
}
}
print output;
next;
}
{
print;
}

View File

@@ -0,0 +1,138 @@
#!/usr/local/bin/gawk -f
#
# wlat-stats --
# Compute statistics of word posterior lattices
#
# $Header: /home/srilm/CVS/srilm/utils/src/wlat-stats.gawk,v 1.6 2019/07/24 16:16:55 stolcke Exp $
#
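# Input is nbest-lattice(1) output in one of two forms: word-lattice "node"
# lines or confusion-network "align" lines; if "reference" lines are present,
# 1-best and oracle (minimum achievable) error counts are computed as well.
#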
BEGIN {
name = "";
nhyps = 0;
entropy = 0;
nwords = 0;
ewords = 0; # posterior expected words
nsub = nins = ndel = 0; # 1best error counts
min_errs = 0; # oracle error count
M_LN10 = 2.30258509299404568402;
empty_hyp = "*DELETE*";
total_posterior = 1;
}
$1 == "name" {
name = $2;
next;
}
$1 == "posterior" {
total_posterior = $2;
next;
}
#
# word lattice format:
# node 46 them 11 0.011827 45 0.0111445 13 0.000682478 ...
#
$1 == "node" {
word = $3;
posterior = $5;
if (word != "NULL") {
nhyps ++;
}
if (posterior > 0) {
for (i = 6; i <= NF; i += 2) {
prob = $(i + 1);
if (prob > 0) {
entropy -= prob * log(prob/posterior);
if (word != "NULL") {
ewords += prob;
}
}
}
}
}
#
# confusion network format:
# align 4 okay 0.998848 ok 0.00113834 i 1.06794e-08 a 4.48887e-08 ...
#
$1 == "align" {
align_pos = $2;
best_hyp = "";
best_posterior = 0;
delete all_hyps;
for (i = 3; i <= NF; i += 2) {
word = $i;
if (word != "*DELETE*") {
nhyps ++;
}
prob = $(i + 1);
if (prob > 0) {
entropy -= prob/total_posterior * log(prob/total_posterior);
all_hyps[word] = 1;
if (word != "*DELETE*") {
ewords += prob/total_posterior;
}
}
if (prob > best_posterior) {
best_posterior = prob;
best_hyp = word;
}
}
}
$1 == "reference" && $2 == align_pos {
if ($3 != empty_hyp) {
nwords ++;
if (best_hyp == empty_hyp) {
ndel ++;
} else if (best_hyp != $3) {
nsub ++;
}
} else {
if (best_hyp != empty_hyp) {
nins ++;
}
}
# update oracle error
if (!($3 in all_hyps)) {
min_errs ++;
}
align_pos = -1;
}
END {
printf name (name != "" ? " " : "") \
nhyps " hypotheses " \
entropy/M_LN10 " entropy " \
ewords " ewords";
if (nwords > 0) {
printf " " nwords " words " nhyps/nwords " hyps/word " \
entropy/M_LN10/nwords " entropy/word";
}
printf "\n";
if (nwords > 0) {
nerrors = nsub + nins + ndel;
printf name (name != "" ? " " : "") \
nerrors " errors " nerrors*100/nwords " WER " \
nsub*100/nwords " SUB " nins*100/nwords " INS " \
ndel*100/nwords " DEL\n";
printf name (name != "" ? " " : "") \
min_errs " minerrors " min_errs*100/nwords " minWER\n";
}
}

View File

@@ -0,0 +1,105 @@
#!/usr/local/bin/gawk -f
#
# wlat-to-dot --
#	Generate dot(1) graph descriptions from word lattices generated by
# nbest-lattice(1)
#
# usage: wlat-to-dot [show_probs=1] file.wlat > file.dot
#
# $Header: /home/srilm/CVS/srilm/utils/src/wlat-to-dot.gawk,v 1.6 2004/11/02 02:00:35 stolcke Exp $
#
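# Typical follow-up, assuming the graphviz dot(1) tool is installed:
#	gawk -f wlat-to-dot.gawk show_probs=1 file.wlat > file.dot
#	dot -Tps file.dot > file.ps
#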
BEGIN {
name = "WLAT";
show_probs = 0;
show_nums = 0;
version = 1;
}
$1 == "name" {
name = $2;
}
#
# nbest-lattice output (without -use-mesh)
#
$1 == "initial" {
print "digraph \"" name "\" {";
print "rankdir = LR";
i = $2;
}
$1 == "final" {
i = $2;
}
$1 == "version" {
version = $2;
}
$1 == "node" && version == 1 {
from = $2;
word = $3;
post = $4;
print "\tnode" from " [label=\"" word \
(!show_nums ? "" : ("/" from)) \
(!show_probs ? "" : "\\n" post ) "\"]";
for (i = 5; i <= NF; i ++) {
to = $i;
print "\tnode" from " -> node" to ";"
}
}
$1 == "node" && version == 2 {
from = $2;
word = $3;
align = $4;
post = $5;
print "\tnode" from " [label=\"" word \
(!show_nums ? "" : ("/" from)) \
"\\n" align \
(!show_probs ? "" : "/" post ) "\"]";
for (i = 6; i <= NF; i += 2) {
to = $i;
print "\tnode" from " -> node" to \
(!show_probs ? "" : " [label=\"" $(i + 1) "\"]") ";"
}
}
#
# nbest-lattice -use-mesh output (confusion networks)
#
$1 == "numaligns" {
print "digraph \"" name "\" {";
print "rankdir = LR";
print "node0 [label=\"" (show_nums ? 0 : "") "\"]";
}
$1 == "align" {
pos = $2;
for (i = 3; i <= NF; i += 2) {
word = $i;
posterior = $(i + 1);
if (posterior == 0) {
print "align " pos ", word " word \
": zero posterior, omitting it" >> "/dev/stderr";
continue;
}
print "node" pos " -> node" (pos + 1) \
" [label=\"" word \
(show_probs ? ("\\n" posterior) : "") \
"\"]";
}
print "node" (pos + 1) " [label=\"" (show_nums ? (pos + 1) : "") "\"]";
}
END {
print "}"
}

Some files were not shown because too many files have changed in this diff.