competition update
0
language_model/srilm-1.7.3/utils/doc/.keepme
Normal file
0
language_model/srilm-1.7.3/utils/obj/i686-m64/STAMP
Normal file
226
language_model/srilm-1.7.3/utils/src/Makefile
Normal file
@@ -0,0 +1,226 @@
#
# File: Makefile.example
# Author: The SRI DECIPHER (TM) System
# Date: Thu Sep 9 12:04:47 1993
#
# Description:
#    This is the example makefile to start from when adding new
#    modules to the DECIPHER System.  To use this makefile, first
#    copy it to your directory as the file "Makefile".  Second,
#    replace the word "Example" in the text below with the real name
#    of your library.  Next replace the example filenames with
#    the names of your actual declarations and source files in the
#    appropriate variable definitions.  Finally clean up by deleting
#    any lines not relevant to your module and updating this header
#    to describe your new module.  Do not forget to use the proper
#    RCS keywords!
#
# Copyright (c) 1993, SRI International.  All Rights Reserved.
#
# $Header: /home/srilm/CVS/srilm/utils/src/Makefile,v 1.76 2019/02/09 07:36:09 stolcke Exp $
#

# Include common SRILM variable definitions.
include $(SRILM)/common/Makefile.common.variables

# This should enable locale-specific string collation for vocabulary sorting
# (it will slow things down somewhat).
#ADDITIONAL_CXXFLAGS = -Dstrcmp=strcoll

# Flags for generating "compact" data structures
COMPACT_FLAGS += -DUSE_SARRAY -DUSE_SARRAY_TRIE -DUSE_SARRAY_MAP2

# Flags for generating "short" data structures
SHORT_FLAGS = $(COMPACT_FLAGS) -DUSE_SHORT_VOCAB -DUSE_XCOUNTS

# Flags for generating "long long" data structures
LLONG_FLAGS = $(COMPACT_FLAGS) -DUSE_LONGLONG_COUNTS -DUSE_XCOUNTS

# enable use of liblbfgs if indicated
ifneq ($(HAVE_LIBLBFGS), )
ADDITIONAL_CFLAGS += -DHAVE_LIBLBFGS
ADDITIONAL_CXXFLAGS += -DHAVE_LIBLBFGS
endif

ADDITIONAL_LDFLAGS += \
	$(MATHERR_LINK)

ADDITIONAL_LIBRARIES += \
	$(SRILM_LIBDIR)/$(LIB_PREFIX)oolm$(LIB_SUFFIX) \
	$(SRILM_LIBDIR)/$(LIB_PREFIX)dstruct$(LIB_SUFFIX) \
	$(SRILM_LIBDIR)/$(LIB_PREFIX)misc$(LIB_SUFFIX) \
	$(SRILM_LIBDIR)/$(LIB_PREFIX)z$(LIB_SUFFIX) \
	$(MATH_LIBRARY) \
	$(LBFGS_LIBRARY)

# Exported programs.
REAL_PROGRAM_NAMES = \
	nbest-rover-helper

# Example programs.
PROGRAM_NAMES = $(REAL_PROGRAM_NAMES)

PROGRAMS = $(PROGRAM_NAMES:%=$(BINDIR)/%$(EXE_SUFFIX))

PROGRAM_SOURCES = $(foreach prog,$(PROGRAM_NAMES),\
	$(wildcard $(SRCDIR)/$(prog).c) \
	$(wildcard $(SRCDIR)/$(prog).cc))
PROGRAM_OBJECTS = $(PROGRAM_NAMES:%=$(OBJDIR)/%$(OBJ_SUFFIX))

# Libraries to be linked with the Example programs.
LIBRARIES = $(LIBRARY) \
	$(ADDITIONAL_LIBRARIES)

# All of the types of files.

ALL_SOURCES = $(PROGRAM_SOURCES)

ALL_OBJECTS = $(PROGRAM_OBJECTS)

ALL_PROGRAMS = $(PROGRAMS)

ALL_PROGRAM_NAMES = $(PROGRAM_NAMES)

#

SCRIPTS = \
	rescore-nbest \
	wordlat-to-lisp \
	extract-skip-probs \
	$(EXPORTED_SCRIPTS)

EXPORTED_SCRIPTS = \
	change-lm-vocab \
	empty-sentence-lm \
	rescore-decipher \
	rescore-acoustic \
	rescore-reweight \
	rescore-minimize-wer \
	make-batch-counts \
	merge-batch-counts \
	make-big-lm \
	make-multiword-pfsg \
	pfsg-from-ngram \
	nbest-error \
	nbest-rover \
	search-rover-combo \
	rexport.gnumake \
	align-with-tags \
	compute-sclite \
	compute-sclite-nbest \
	compare-sclite \
	cumbin

# scripts that need to be edited before installation
EDIT_SCRIPTS = \
	add-classes-to-pfsg \
	add-dummy-bows \
	add-pauses-to-pfsg \
	add-ppls \
	bytelog-to-log10 \
	classes-to-fsm \
	combine-acoustic-scores \
	combine-rover-controls \
	rover-control-weights \
	rover-control-tying \
	compare-ppls \
	compute-best-mix \
	compute-best-rover-mix \
	compute-best-sentence-mix \
	compute-oov-rate \
	concat-sausages \
	context-ngrams \
	continuous-ngram-count \
	de-vq-lm \
	extract-skip-probs \
	filter-event-counts \
	find-reference-posteriors \
	fix-ctm \
	fsm-to-pfsg \
	get-gt-counts \
	get-unigram-probs \
	hits-from-log \
	log10-to-bytelog \
	make-abs-discount \
	make-diacritic-map \
	make-google-ngrams \
	make-gt-discounts \
	make-kn-discounts \
	make-kn-counts \
	make-hiddens-lm \
	make-lm-subset \
	make-nbest-pfsg \
	make-ngram-pfsg \
	make-sub-lm \
	metadb \
	sort-lm \
	reverse-lm \
	merge-nbest \
	nbest-posteriors \
	nbest2-to-nbest1 \
	nbest-optimize-args-from-rover-control \
	nbest-oov-counts \
	nbest-vocab \
	nbest-words \
	pfsg-to-dot \
	pfsg-to-fsm \
	pfsg-vocab \
	htklat-vocab \
	ppl-from-log \
	remove-lowprob-ngrams \
	replace-unk-words \
	replace-words-with-classes \
	reverse-text \
	reverse-ngram-counts \
	sentid-to-sclite \
	sentid-to-ctm \
	split-tagged-ngrams \
	subset-context-ngrams \
	subtract-ppls \
	tolower-ngram-counts \
	uniform-classes \
	uniq-ngram-counts \
	vp2text \
	wlat-to-dot \
	wlat-to-pfsg \
	wlat-stats \
	wordlat-to-lisp \
	prettify \
	select-vocab


# Define targets.

all:	$(PROGRAMS)

$(LIBRARY): $(LIB_OBJECTS)
	$(ARCHIVE) $(AR_OUTPUT_OPTION) $^ $(DEMANGLE_FILTER)
	$(RANLIB) $@ $(DEMANGLE_FILTER)

$(PROGRAMS): $(LIBRARY) $(OTHER_LIBRARIES)

# Variables and Targets for released system

EXPORTED_PROGRAMS = \
	$(EDIT_SCRIPTS:%=$(BINDIR)/%) \
	$(REAL_PROGRAM_NAMES:%=$(BINDIR)/%$(EXE_SUFFIX))

release:	release-scripts release-programs

# Include common SRILM target definitions.
include $(SRILM)/common/Makefile.common.targets

#
# Rule to create edited gawk script
#
$(BINDIR)/%: $(SRCDIR)/%.gawk $(BINDIR_STAMP) $(SRILM)/common/Makefile.machine.$(MACHINE_TYPE)
	sed -e '1s,/usr/local/bin/gawk,$(GAWK),' $< >$@.new
	mv $@.new $@

#
# Rule to create edited perl script
#
$(BINDIR)/%: $(SRCDIR)/%.pl $(BINDIR_STAMP) $(SRILM)/common/Makefile.machine.$(MACHINE_TYPE)
	sed -e '1s,/usr/local/bin/perl,$(PERL),' $< >$@.new
	mv $@.new $@

172
language_model/srilm-1.7.3/utils/src/add-classes-to-pfsg.gawk
Executable file
@@ -0,0 +1,172 @@
#!/usr/local/bin/gawk -f
#
# add-classes-to-pfsg --
#	Modify Decipher PFSG by expanding class nodes with words
#
# usage: add-classes-to-pfsg classes=<expansions> pfsg > expanded-pfsg
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-classes-to-pfsg.gawk,v 1.5 2004/11/02 02:00:35 stolcke Exp $
#

function read_classes(file) {

	num_class_defs = 0;
	delete num_class_expansions;
	delete class_expansions;
	delete class_expansion_probs;

	while ((getline line < file) > 0) {

		n = split(line, a);
		if (n == 0) continue;

		class = a[1];
		num_exp = ++ num_class_expansions[class];

		if (a[2] ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
			prob = a[2];
			i = 3;
		} else {
			prob = "";
			i = 2;
		}

		expansion = a[i];
		for (i++; i <= n; i++) {
			expansion = expansion " " a[i];
		}

		class_expansions[class " " num_exp] = expansion;
		if (prob != "") {
			class_expansion_probs[class " " num_exp] = prob;
		}
		num_class_defs ++;
	}

	print "read " num_class_defs " class expansions" >> "/dev/stderr";

	# assign default expansion probs

	for (class in num_class_expansions) {

		num_exp = num_class_expansions[class];

		for (i = 1; i <= num_exp; i ++) {
			if (class_expansion_probs[class " " i] == "") {
				class_expansion_probs[class " " i] = 1/num_exp;
			}
		}

	}
}

######################################################################

BEGIN {
	logscale = 10000.5;
	round = 0.5;

	null = "NULL";

	classes_toupper = 1;	# map class names to upper case
}

function rint(x) {
	if (x < 0) {
		return int(x - round);
	} else {
		return int(x + round);
	}
}

function scale_prob(x) {
	return rint(log(x) * logscale);
}

function print_class_pfsg(class) {
	print "name " (classes_toupper ? toupper(class) : class);

	# compute total number of nodes needed
	num_exp = num_class_expansions[class];
	num_words = 0;
	all_words = "";
	for (i = 1; i <= num_exp; i ++) {
		num_words += split(class_expansions[class " " i], a);
		all_words = all_words " " class_expansions[class " " i];
	}

	print "nodes " (num_words + 2) " " null " " null all_words;

	initial = 0;
	final = 1;
	print "initial " initial;
	print "final " final;

	print "transitions " (num_words + num_exp);

	node_index = final;

	for (i = 1; i <= num_exp; i ++) {
		n = split(class_expansions[class " " i], a);
		if (n == 0) {
			print initial, final, \
				scale_prob(class_expansion_probs[class " " i]);
		} else {
			print initial, ++node_index, \
				scale_prob(class_expansion_probs[class " " i]);

			for (k = 2; k <= n; k ++) {
				print node_index, node_index + 1, 0;
				node_index ++;
			}

			print node_index, final, 0;
		}
	}

	print "";
}

NR == 1 {
	if (classes) {
		read_classes(classes);
	}
	close(classes);
}

# record class names used in PFSGs
$1 == "nodes" {
	for (i = 3; i <= NF; i ++) {
		if ($i != null && $i in num_class_expansions) {
			class_used[$i] = 1;
			if (classes_toupper) {
				upper_class = toupper($i);

				if ($i != upper_class && upper_class in num_class_expansions) {
					print "cannot map class " $i \
						" to uppercase due to name conflict" >> "/dev/stderr";
					exit 1;
				}

				$i = upper_class;
			}
		}
	}
	print;
	next;
}

# pass old PFSGs through unchanged
{
	print;
}

# dump out class PFSGs
END {
	print "";

	for (class in class_used) {
		print_class_pfsg(class);
	}
}

35
language_model/srilm-1.7.3/utils/src/add-dummy-bows.gawk
Executable file
@@ -0,0 +1,35 @@
#!/usr/local/bin/gawk -f
#
# add-dummy-bows --
#	add redundant backoff weights to model file to make some broken
#	programs happy.
#	(Normally a backoff weight is only required for ngrams that
#	are prefixes of longer ngrams.)
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-dummy-bows.gawk,v 1.1 1995/09/20 17:36:30 stolcke Exp $
#

NF==0 {
	print; next;
}
/^ngram *[0-9][0-9]*=/ {
	order = substr($2,1,index($2,"=")-1);
	if (order > highorder) highorder = order;
	print;
	next;
}
/^.[0-9]-grams:/ {
	currorder=substr($0,2,1);
}
/^\\/ {
	print; next;
}
currorder && currorder < highorder {
	if (NF < currorder + 2) {
		print $0 "\t0";
	} else {
		print;
	}
	next;
}
{ print }
171
language_model/srilm-1.7.3/utils/src/add-pauses-to-pfsg.gawk
Executable file
@@ -0,0 +1,171 @@
#!/usr/local/bin/gawk -f
#
# add-pauses-to-pfsg --
#	Modify Decipher PFSG to allow pauses between words
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-pauses-to-pfsg.gawk,v 1.15 2015-07-03 03:44:52 stolcke Exp $
#
BEGIN {
	pause = "-pau-";
	top_level_name = "TOP_LEVEL";
	pause_filler_name = "PAUSE_FILLER";
	null = "NULL";

	wordwrap = 1;	# wrap pause filler around words
	pauselast = 0;	# make pauses follow wrapped words
	version = 0;	# no "version" line by default
}

#
# output the TOP_LEVEL model
# oldname is the name of the original pfsg
function print_top_level(oldname) {
	if (version) {
		print "version " version "\n";
	}
	print "name " top_level_name;
	if (pauselast) {
		print "nodes 4 " null " " pause_filler_name " " oldname " " null;
	} else {
		print "nodes 4 " null " " oldname " " pause_filler_name " " null;
	}
	print "initial 0"
	print "final 3"
	print "transitions 4"
	print "0 1 0"
	print "1 2 0"
	if (pauselast) {
		print "0 2 0"
	} else {
		print "1 3 0"
	}
	print "2 3 0"
	print "";
}

function word_wrapper_name(word) {
	return "_" word "_PF";
}

#
# output a pause wrapper for word
#
function print_word_wrapper(word) {
	print "name " word_wrapper_name(word);
	if (pauselast) {
		print "nodes 3 " word " " pause_filler_name " " null;
	} else {
		print "nodes 3 " null " " pause_filler_name " " word;
	}
	print "initial 0";
	print "final 2";
	print "transitions 3";
	print "0 1 0";
	print "1 2 0";
	print "0 2 0";
	print "";
}

#
# output the pause filler
#
function print_pause_filler() {
	print "name " pause_filler_name;
	print "nodes 3 " null " " pause " " null;
	print "initial 0";
	print "final 2";
	print "transitions 3";
	print "0 1 0";
	print "1 1 0";
	print "1 2 0";
}

NF == 0 {
	print;
	next;
}

#
# read vocabulary list if supplied
#
NR == 1 && vocab != "" {
	while (getline line < vocab) {
		if (split(line, a)) {
			word_list[a[1]] = 1;
		}
	}
	close (vocab);
}

#
# check that a node name is a word
# if a vocabulary was not specified we use the following heuristic:
# word nodes contain at least one lowercase or non-ascii character and are not
# surrounded by "*...*" (which indicates a class name).
#
function is_word(w) {
	if (vocab) {
		return w in word_list;
	} else {
		return !is_classname(w);
	}
}

function is_classname(w) {
	return w ~ /^\*.*\*$/ || !(w ~ /[[:lower:]]/ || w ~ /[^\x00-\x7F]/);
}

#
# first time we see a pfsg name, issue a top-level wrapper for it.
#
$1 == "name" && !have_top_level {
	print_top_level($2);
	print;
	have_top_level = 1;
	next;
}

#
# maps word nodes to wrapper nodes
#
$1 == "nodes" {
	numnodes = $2;
	printf "nodes %d", numnodes;

	for (i = 0; i < numnodes; i ++) {
		node_name = $(i + 3);

		# if it contains lowercase characters it's a word and
		# needs to be wrapped
		if (wordwrap && is_word(node_name) && \
		    node_name != pause && node_name != null)
		{
			if (!(node_name in all_words)) {
				all_words[node_name] = 1;
				words[++num_words] = node_name;
			}
			printf " %s", word_wrapper_name(node_name);
		} else {
			printf " %s", node_name;
		}
	}
	printf "\n";
	next;
}

{
	print;
}

END {
	#
	# output the word wrappers
	#
	if (wordwrap) {
		for (i = 1; i <= num_words; i ++) {
			print_word_wrapper(words[i]);
		}
	}

	print_pause_filler();
}
30
language_model/srilm-1.7.3/utils/src/add-ppls.gawk
Executable file
@@ -0,0 +1,30 @@
#!/usr/local/bin/gawk -f
#
# add-ppls --
#	Add text statistics (from -ppl output)
#
# Copyright (c) 1995,1997 SRI International.  All Rights Reserved
#
# $Header: /home/srilm/CVS/srilm/utils/src/add-ppls.gawk,v 1.2 1997/07/12 05:01:08 stolcke Exp $
#
/^file .*: .* sentences/ {
	totalsents += $3;
	totalwords += $5;
	totaloovs += $7;

	getline;

	zeroprobs += $1;
	totalprob += $4;
}
END {
	M_LN10 = 2.30258509299404568402;	# from <math.h>

	ppl = exp (- M_LN10 * totalprob / \
		(totalwords - totaloovs - zeroprobs + totalsents));

	printf "file TOTAL: %d sentences, %d words, %d OOVs\n", \
		totalsents, totalwords, totaloovs;
	printf "%d zeroprobs, logprob= %g ppl= %g\n", \
		zeroprobs, totalprob, ppl;
}
194
language_model/srilm-1.7.3/utils/src/align-with-tags
Executable file
@@ -0,0 +1,194 @@
#!/bin/sh
#
# align-with-tags --
#	align reference transcript with tags to hypothesized
#	transcripts, merging the tags into the latter
#
# $Header: /home/srilm/CVS/srilm/utils/src/align-with-tags,v 1.7 2015-07-03 03:45:38 stolcke Exp $
#

usage () {
	echo "usage: $0 [-r ref -h hyp] [-dictionary D] [-aligndir A] [-options...]" >&2
	exit 2;
}

ref=/dev/null
hyp=/dev/null
dictionary=/dev/null

while [ $# -gt 0 ]; do
	case "$1" in
	-r)	ref="$2"
		shift; shift;;
	-h)	hyp="$2"
		shift; shift;;
	-dictionary)
		dictionary=$2
		shift; shift;;
	-aligndir)
		aligndir=$2
		shift; shift;;
	-\?)	usage;;
	-*)	pass_options="$pass_option $1"
		shift;;
	*)	break;;
	esac
done

if [ $# -ge 2 ]; then
	ref="$1"
	hyp="$2"
elif [ $# -gt 0 ]; then
	usage;
fi

tmpdir=${TMPDIR-/tmp}
tmpdict="$tmpdir/dict$$"
tmptags="$tmpdir/tags$$"
tmprefs="$tmpdir/refs$$"
tmphyps="$tmpdir/hyps$$"
tmpnbest="$tmpdir/nbest$$"
tmpmerge="$tmpdir/merged$$"

if [ -n "$aligndir" ]; then
	tmpmerge=
fi

trap "rm -rf $tmpdict $tmptags $tmprefs $tmphyps $tmpnbest $tmpmerge; exit" 0 1 2 15

if [ -n "$aligndir" ]; then
	mkdir -p $aligndir
	tmpmerge=$aligndir
fi

prepare_text () {
	${GAWK-gawk} -v tag_file=$2 '
	BEGIN {
		tag_list["<default>"] = 1;
	}
	function is_tag(x) {
		return (x ~ /^<.*>$/);
	}
	{
		for (i = 2; i <= NF; i ++) {
			if (is_tag($i)) {
				tag_list[$i] = 1;
			} else {
				$i = tolower($i);
			}
			if (!is_tag($(i - 1)) && !is_tag($i)) {
				$(i - 1) = $(i - 1) " <default>";
			}
		}
		if (!is_tag($NF)) {
			$NF = $NF " <default>";
		}
		print $0;
	}
	END {
		if (tag_file) {
			for (tag in tag_list) {
				print tag > tag_file;
			}
		}
	}' $1;
}

parse_alignment () {
	gzip -d -c -f < $1 | \
	${GAWK-gawk} -v sentid=$2 'BEGIN {
		output = sentid;

		show_refs = 1;
	}

	function is_empty(x) {
		return x == "<default>" || tolower(x) == "*delete*";
	}

	function is_tag(x) {
		return x ~ /^<.*>$/;
	}

	$1 == "align" {
		if (NF == 4 && $4 == 1) {
			# matching hyp and ref
			if (!is_empty($3)) {
				output = output " " $3;
			}
		} else if (NF == 6 && $4 == 1 && $6 == 0) {
			# mismatched hyp and ref
			if (is_empty($3)) {
				if (is_tag($5)) {
					if (!is_empty($5)) \
						output = output " " $5;
				} else if (showrefs) {
					output = output " (" $5 ")";
				}
			} else {
				if (is_empty($5) || !showrefs) {
					output = output " " $3;
				} else {
					output = output " " $3 " (" $5 ")";
				}
			}
		} else {
			print "unexpected alignment: " $0 > "/dev/stderr";
		}
	}
	END {
		print output;
	}'
}

set -e

#
# format hyps and refs for alignment
#
prepare_text $ref $tmptags > $tmprefs
prepare_text $hyp > $tmphyps

#
# add tag pronunciations to the dictionary
#
if [ $dictionary != /dev/null ]; then
	gzip -d -c -f $dictionary > $tmpdict
else
	> $tmpdict
fi
${GAWK-gawk} '{ print $1, "**TAG**" }' $tmptags >> $tmpdict

#
# do the alignments
#
mkdir -p $tmpnbest $tmpmerge

cat $tmphyps | \
while read sentid words
do
	echo "0 0 0 $words" > $tmpnbest/$sentid

	echo $tmpnbest/$sentid
done | \
nbest-lattice -nbest-files - \
	-use-mesh \
	-dictionary $tmpdict \
	-keep-noise \
	-refs "$tmprefs" \
	$pass_options \
	-write-dir $tmpmerge | \
(
	last_sentid=
	while read sentid rest
	do
		if [ -n "$last_sentid" ]; then
			parse_alignment $tmpmerge/$last_sentid.gz $last_sentid
		fi
		last_sentid=$sentid
	done
	if [ -n "$last_sentid" ]; then
		parse_alignment $tmpmerge/$last_sentid.gz $last_sentid
	fi
)

19
language_model/srilm-1.7.3/utils/src/bytelog-to-log10.gawk
Executable file
@@ -0,0 +1,19 @@
#!/usr/local/bin/gawk -f
#
# bytelog-to-log10 --
#	convert bytelog scores to log-base-10
#
# $Header: /home/srilm/CVS/srilm/utils/src/bytelog-to-log10.gawk,v 1.2 2002/05/15 04:47:13 stolcke Exp $
#
BEGIN {
	logscale = 2.30258509299404568402 * 10000.5 / 1024.0;
	scale = 1;
}
{
	for (i = 1; i <= NF; i ++) {
		if ($i ~ /^[-+]+[0-9][0-9]*$/) {
			$i = $i / scale / logscale;
		}
	}
	print;
}
78
language_model/srilm-1.7.3/utils/src/change-lm-vocab
Executable file
@@ -0,0 +1,78 @@
#!/bin/sh
#
# change-lm-vocab --
#	create a language model from an existing one by changing its
#	vocabulary.
#	All n-grams in the new vocab are retained with their original
#	probabilities.  Backoff weights are recomputed and backed-off
#	unigrams for all new words are added.
#	-subset option performs subsetting of the vocabulary without adding
#	new words.
#
# usage: change-lm-vocab [-subset] -vocab vocabfile -lm oldlm -write-lm newlm
#
# $Header: /home/srilm/CVS/srilm/utils/src/change-lm-vocab,v 1.9 2013/03/09 07:13:01 stolcke Exp $
#

oldlm=-
newlm=-
vocab=/dev/null

while [ $# -gt 0 ]; do
	case "$1" in
	-vocab)	vocab="$2" ; shift ;;
	-lm)	oldlm="$2" ; shift ;;
	-write-lm)	newlm="$2" ; shift ;;
	-tolower)	options="$options $1" ; tolower=1 ;;
	-subset)	subset=yes ;;
	*)	options="$options $1" ;;
	esac
	shift
done

# -subset prevents new words being added to the LM
if [ "$subset" ]; then
	ngram_vocab="/dev/null"
else
	ngram_vocab="$vocab"
fi

gzip -dcf $oldlm | ${GAWK-gawk} '
# read the vocab file
NR == 1 && vocab {
	# always include sentence begin/end
	is_word["<s>"] = is_word["</s>"] = 1;

	while ((getline word < vocab) > 0) {
		is_word[to_lower ? tolower(word) : word] = 1;
	}

	close(vocab);
}
# process old lm
NF==0 {
	print; next;
}
/^ngram *[0-9][0-9]*=/ {
	order = substr($2,1,index($2,"=")-1);
	print;
	next;
}
/^\\[0-9]-grams:/ {
	currorder=substr($0,2,1);
	print;
	next;
}
/^\\/ {
	print; next;
}
currorder {
	for (i = 2 ; i <= currorder + 1; i ++) {
		if (!((to_lower ? tolower($i) : $i) in is_word)) next;
	}
	print;
	next;
}
{ print }
' vocab=$vocab to_lower=$tolower | \
ngram -lm - -vocab "$ngram_vocab" -renorm -write-lm "$newlm" $options
134
language_model/srilm-1.7.3/utils/src/classes-to-fsm.gawk
Executable file
@@ -0,0 +1,134 @@
#!/usr/local/bin/gawk -f
#
# usage: classes-to-fsm [symbolic=1] [isymbolfile=ISYMBOLS] [osymbolfile=OSYMBOLS] \
#		vocab=VOCAB CLASSES > class.fsm
#
# where ISYMBOLS is the input symbol table, OSYMBOLS is the output symbol table
# VOCAB is the word list
#
# $Header: /home/srilm/CVS/srilm/utils/src/classes-to-fsm.gawk,v 1.1 1999/09/27 01:10:27 stolcke Exp $
#
BEGIN {
	empty_input = "NULL";
	empty_output = "NULL";
	input_symbols[empty_input] = 0;
	output_symbols[empty_output] = 0;
	numinputs = 1;
	numoutputs = 1;

	isymbolfile = "";
	osymbolfile = "";
	symbolic = 0;

	startstate = 0;
	numstates = 1;

	M_LN10 = 2.30258509299404568402;	# from <math.h>
	logscale = 10000.5;
	round = 0.5;
}

NR == 1 {
	# print start/end state
	print startstate;

	if (vocab) {
		while ((getline vline < vocab) > 0) {
			if (split(vline, a) >= 1) {
				word = a[1];
				input_symbols[word] = numinputs ++;
				output_symbols[word] = numoutputs ++;

				# print identity transition for vocab words
				print startstate, startstate, \
					(symbolic ? word : input_symbols[word]), \
					(symbolic ? word : output_symbols[word]);
			}
		}

	}
}

function rint(x) {
	if (x < 0) {
		return int(x - round);
	} else {
		return int(x + round);
	}
}

function scale_prob(x) {
	return rint(log(x) * logscale);
#	return log(x) / M_LN10;
}

# input format is
#	CLASS [PROB] WORD1 WORD2 ... WORDN
{
	if (NF == 0) {
		next;
	}

	class = $1;

	if (!(class in input_symbols)) {
		input_symbols[class] = numinputs++;
	}

	if ($2 ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
		prob = $2;
		first = 3;
	} else {
		prob = 1;
		first = 2;
	}

	# deal with empty class expansion: map class to NULL
	if (first > NF) {
		print startstate, startstate, \
			(symbolic ? class : input_symbols[class]), \
			(symbolic ? empty_output : 0), \
			-scale_prob(prob);
	}

	for (i = first; i <= NF; i ++) {
		if (!($i in output_symbols)) {
			output_symbols[$i] = numoutputs ++;
		}

		if (i == NF) {
			next_state = startstate;
		} else {
			next_state = numstates ++;
		}

		if (i == first) {
			print startstate, next_state,
				(symbolic ? class : input_symbols[class]), \
				(symbolic ? $i : output_symbols[$i]), \
				-scale_prob(prob);
		} else {
			print last_state, next_state,
				(symbolic ? empty_input : 0), \
				(symbolic ? $i : output_symbols[$i]), \
				-scale_prob(1);
		}

		last_state = next_state;
	}
}

END {
	if (isymbolfile) {
		for (word in input_symbols) {
			print word, input_symbols[word] > isymbolfile;
		}
		close(isymbolfile);
	}
	if (osymbolfile) {
		for (word in output_symbols) {
			print word, output_symbols[word] > osymbolfile;
		}
		close(osymbolfile);
	}
}
114
language_model/srilm-1.7.3/utils/src/combine-acoustic-scores.gawk
Executable file
@@ -0,0 +1,114 @@
#!/usr/local/bin/gawk -f
#
# combine acoustic scores in nbest lists with additional acoustic score files
# (used by rescore-acoustic and nbest-rover)
#
# Setting the "max_nbest" limits the number of hyps retrieved from each
# input score list.
# If max_nbest is set and an additional score file contains fewer values
# than the nbest list is long, missing values are filled in with the
# minimal score found in that file.
#
# $Header: /home/srilm/CVS/srilm/utils/src/combine-acoustic-scores.gawk,v 1.9 2019/02/22 20:55:10 stolcke Exp $
#
function get_from_file(i) {
	if (ARGV[i] ~ /\.gz$/) {
		status = (("exec gzip -dc " ARGV[i]) | getline);
	} else {
		status = (getline < ARGV[i]);
	}
	if (status < 0) {
		print "error reading from " ARGV[i] >> "/dev/stderr";
		exit 1;
	}
	return status;
}

BEGIN {
	hypno = 0;

	sentid = ARGV[1];
	sub(".*/", "", sentid);
	sub("\\.gz$", "", sentid);
	sub("\\.score$", "", sentid);

	bytelogscale = 1024.0 / 10000.5 / 2.30258509299404568402;

	nweights = split(weights, weight);
	if (nweights != ARGC - 1) {
		print "number of weights doesn't match number of score files" \
			>> "/dev/stderr";
		exit 1;
	}

	# format of input nbest list
	nbestformat = 0;

	while ((max_nbest == 0 || hypno < max_nbest) && get_from_file(1)) {

		if ($1 == "NBestList1.0") {
			nbestformat = 1;
			print;
			continue;
		} else if ($1 == "NBestList2.0") {
			nbestformat = 2;
			print;
			continue;
		}

		old_ac = $1; $1 = "";
		if (nbestformat > 0) {
			# Decipher nbest format: just use the aggregate
			# score as the acoustic score
			# For version 2 format, the total score is updated,
			# reflecting the change in acoustic scores.
			# Other programs recover the acoustic score as the
			# difference of the total score and the accumulated
			# LM scores, so this gives the right results.
			gsub("[()]", "", old_ac);
			old_ac *= bytelogscale;
		}

		hyp = $0;

		total_ac = weight[1] * old_ac;
		for (i = 2; i < ARGC; i ++) {
			if (!get_from_file(i)) {
				if (max_nbest == 0) {
					print "missing score in " ARGV[i] \
						>> "/dev/stderr";
					exit 2
				} else {
					new_ac = min_score[i];
				}
			} else {
				# skip nbest header
				if ($1 ~ /NBestList/) {
					i --;
					continue;
				}

				new_ac = $1;

				# handle decipher-style scores
				if (new_ac ~ /\(.*\)/) {
					gsub("[()]", "", new_ac);
					new_ac *= bytelogscale;
				}

				# replace minimum score if needed
				if (!(i in min_score) || $1 < min_score[i]) {
					min_score[i] = new_ac;
				}
			}
			total_ac += weight[i] * new_ac;
		}

		if (nbestformat > 0) {
			total_ac = sprintf("(%f)", total_ac / bytelogscale);
		}
		print total_ac hyp;

		hypno ++;
	}
}
163
language_model/srilm-1.7.3/utils/src/combine-rover-controls.gawk
Executable file
@@ -0,0 +1,163 @@
#!/usr/local/bin/gawk -f
#
# combine-rover-controls --
#	combine several rover control files for system combination
#	(may be used recursively)
#
# $Header: /home/srilm/CVS/srilm/utils/src/combine-rover-controls.gawk,v 1.7 2017/08/16 06:34:16 stolcke Exp $
#

function process_rover_control(file, weight, pscale) {

	dir = file;
	sub("/[^/]*$", "", dir);
	if (file == dir) {
		dir = "";
	}

	while ((status = (getline < file)) > 0) {

		if (NF == 0) continue;

		# skip comment line
		if (/^##/) continue;

		if (!keep_paths) {
			# deal with relative directories in rover-control file:
			# prepend rover-control directory path
			if ($1 !~ /^\// && dir != "") {
				$1 = dir "/" $1;
			}
		}

		if ($3 == "+") {
			system_id = system_id $1 " " $2 " +\n";
		} else {
			nsystems += 1;

			# handle missing lmw and wtw and system weights
			if ($2 == "") $2 = 8;
			if ($3 == "") $3 = 0;
			if ($4 == "") $4 = 1;

			# missing nbest depth limit
			if ($5 == "") nbest_depth[nsystems] = 0;
			else nbest_depth[nsystems] = $5;

			# override posterior scale if specified
			if (pscale) system_pscale[nsystems] = pscale;
			else system_pscale[nsystems] = $6

			system_id = system_id $1 " " $2 " " $3;

			# see if this system has appeared before
			if (system_id in system_index) {
				# merge system weights
				# ensuring weight tying spec is compatible
				if ($4 == "=") {
					if (system_weight[system_index[system_id]] != "=") {
						print "cannot combine weight tying" > "/dev/stderr";
						exit(1);
					}
				} else {
					if (system_weight[system_index[system_id]] == "=") {
						print "cannot combine weight tying" > "/dev/stderr";
						exit(1);
					}
					system_weight[system_index[system_id]] += $4 * weight;
				}

				# skip the duplicate system
				nsystems -= 1;
			} else {
				# divide system weight by total number of input files
				# but preserve weight tying info
				if ($4 == "=") {
					system_weight[nsystems] = $4;
				} else {
					system_weight[nsystems] = $4 * weight;
				}

				system_dirs_weights[nsystems] = system_id;

				system_index[system_id] = nsystems;
			}

			system_id = "";
		}
	}

	if (status < 0) {
		print file ": " ERRNO > "/dev/stderr";
		exit(1);
	}
	close(file);

	return;
}

BEGIN {
	arg_offset = 0;
	ninputs = ARGC - 1;
	nsystems = 0;

	while (1) {
		if (ARGV[arg_offset+1] ~ /^lambda=/) {
			lambda = substr(ARGV[arg_offset+1], length("lambda")+2);
			ninputs -= 1;
			arg_offset += 1;
		} else if (ARGV[arg_offset+1] ~ /^postscale=/) {
			postscale = substr(ARGV[arg_offset+1], length("postscale")+2);
			ninputs -= 1;
			arg_offset += 1;
		} else if (ARGV[arg_offset+1] ~ /^norm=/) {
			norm_weights = substr(ARGV[arg_offset+1], length("norm")+2);
			ninputs -= 1;
			arg_offset += 1;
		} else if (ARGV[arg_offset+1] ~ /^keeppaths=/) {
			keep_paths = substr(ARGV[arg_offset+1], length("keeppaths")+2);
			ninputs -= 1;
			arg_offset += 1;
		} else {
			break;
		}
	}

	if (ninputs < 1) {
		print "usage: " ARGV[0] " [lambda=WEIGHTS] [postscale=S] ROVER-CTRL1 ROVER-CTRL2 ..." \
			>> "/dev/stderr";
		exit(2);
	}

	# initialize priors from lambdas
	nlambdas = split(lambda, lambdas);
	lambda_sum = 0.0;
	for (i = 1; i <= nlambdas; i ++) {
		lambda_sum += lambdas[i];
	}
	# fill in the missing lambdas with uniform values
	for (i = nlambdas + 1; i <= ninputs; i ++) {
		lambdas[i] = (1 - lambda_sum)/(ninputs - nlambdas);
	}

	for (i = 1; i <= ninputs; i ++) {
		process_rover_control(ARGV[arg_offset + i], lambdas[i], postscale);
	}

	if (norm_weights) {
		weight_sum = 0;
		for (i = 1; i <= nsystems; i ++) {
			weight_sum += system_weight[i];
		}
		for (i = 1; i <= nsystems; i ++) {
			system_weight[i] /= weight_sum;
		}
	}

	for (i = 1; i <= nsystems; i ++) {
		print system_dirs_weights[i], system_weight[i], nbest_depth[i], system_pscale[i];
	}

	exit(0);
}

92
language_model/srilm-1.7.3/utils/src/compare-ppls.gawk
Executable file
@@ -0,0 +1,92 @@
#!/usr/local/bin/gawk -f
#
# compare-ppls --
#	Compare two LMs for significant differences in probabilities
#	The probabilities calculated for the test set words are ranked
#	pairwise, as appropriate for submitting the result to a sign test.
#
# usage: compare-ppls [mindelta=d] pplout1 pplout2
#
# where pplout1, pplout2 is the output of ngram -debug 2 -ppl for the two
# models.  d is the minimum difference of logprobs for two probs to
# be considered different.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compare-ppls.gawk,v 1.6 2014-07-03 05:57:09 stolcke Exp $
#
function abs(x) {
	return (x < 0) ? -x : x;
}
BEGIN {
	sampleA_no = 0;
	sampleB_no = 0;
	mindelta = 0;
	verbose = 0;
	signif = 0;

	diff_sum = 0;
	diff_squared_sum = 0;

	logINF = -100000;
}
FNR == 1 {
	if (!readingA) {
		readingA = 1;
	} else {
		readingA = 0;
	}
}
readingA && $1 == "p(" {
	if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) prob = logINF;
	else prob = $10;

	sampleA[sampleA_no ++] = prob;
}
!readingA && $1 == "p(" {
	if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) prob = logINF;
	else prob = $10;

	if (sampleB_no > sampleA_no) {
		printf "sample B contains more data than sample A" >> "/dev/stderr";
		exit(1);
	}

	diff = sampleA[sampleB_no] - prob;

	if (abs(diff) <= mindelta) {
		equal ++;
	} else {
		diff_sum += diff;
		diff_squared_sum += diff * diff;

		if (diff < 0) {
			if (verbose) {
				print;
			}
			greater ++;
		}
	}

	sampleB_no ++;
}
END {
	if (sampleB_no < sampleA_no) {
		printf "sample B contains less data than sample A" >> "/dev/stderr";
		print sampleB_no, sampleA_no;
		exit(1);
	}

	mean_diff = diff_sum / sampleA_no;
	mean_sq_error = diff_squared_sum / sampleA_no - mean_diff * mean_diff;
	stdev = sqrt(mean_sq_error);

	printf "total %d, equal %d, different %d, greater %d\n", \
		sampleB_no, equal, sampleB_no - equal, greater;
	printf "meandiff %g, mse %g, stdev %g\n", \
		mean_diff, mean_sq_error, stdev;

	if (signif) {
		printf "significance:\n";
		less = sampleB_no - equal - greater;
		system("cumbin " (less+greater) " " (less>greater ? less : greater));
	}
}
131
language_model/srilm-1.7.3/utils/src/compare-sclite
Executable file
@@ -0,0 +1,131 @@
#!/bin/sh
#
# compare-sclite --
#	compare sclite word error sentence-by-sentence
#
# $Header: /home/srilm/CVS/srilm/utils/src/compare-sclite,v 1.26 2017/08/12 05:48:34 stolcke Exp $
#

# enforce proper sorting order
LC_COLLATE=C
export LC_COLLATE

if [ $# -lt 3 ]; then
	echo "usage: $0 [-v] -h1 hyps1 -h2 hyps2 -r refs [-S id-subset] [-M|-multiwords] [sclite-options ...]" >&2
	echo "   or  $0 hyps1 hyps2 refs" >&2
	exit 2
elif [ $# -eq 3 ]; then
	# old syntax
	hypsA=${1}
	hypsB=${2}
	refs=${3}
else
	# parse arguments
	while [ $# -gt 0 ]; do
		case "$1" in
		-r)	refs=$2; shift ;;
		-h1)	hypsA=$2; shift ;;
		-h2)	hypsB=$2; shift ;;
		-S)	options="$options -S $2"; shift ;;
		*)	options="$options $1" ;;
		esac
		shift
	done
fi

tmpdir=${TMPDIR-/tmp}
pralignA=pralignA$$
pralignB=pralignB$$
subset="$tmpdir/subset$$"

trap '/bin/rm -f $tmpdir/$pralignA.pra $tmpdir/$pralignB.pra $subset.*' 0 1 2 13 15

set -e

#
# use the intersection of the two hyp sets and (if specified) the -S set
#
case "$hypsA" in
*.ctm)	case "$hypsB" in
	*.ctm)	${GAWK-gawk} '{ print $1 "_" $2 }' < "$hypsA" | sort -u > $subset.A
		${GAWK-gawk} '{ print $1 "_" $2 }' < "$hypsB" | sort -u > $subset.B
		;;
	*)	echo "both hyps must be in same format" >&2
		exit 2
		;;
	esac
	;;
*)	case "$hypsB" in
	*.ctm)	echo "both hyps must be in same format" >&2
		exit 2
		;;
	*)	${GAWK-gawk} '{ print $1 }' < "$hypsA" | sort -u > $subset.A
		${GAWK-gawk} '{ print $1 }' < "$hypsB" | sort -u > $subset.B
		;;
	esac
	;;
esac

comm -12 $subset.A $subset.B > $subset.AB
options="$options -S $subset.AB"

#
# generate alignments for the two hyp sets
#
compute-sclite -h "$hypsA" -r "$refs" $options -O $tmpdir -n $pralignA -o pralign
compute-sclite -h "$hypsB" -r "$refs" $options -O $tmpdir -n $pralignB -o pralign

#
# compute error totals by utterance and compare
#
${GAWK-gawk} '
BEGIN {
	less = greater = equal = 0;
}
$1 == "id:" {
	sentid = $2;
	sub("^\\(", "", sentid);
	sub("\\)$", "", sentid);
	next;
}
$1 == "Scores:" {
	corr = $6;
	subs = $7;
	dels = $8;
	inss = $9;

	words = corr + subs + dels;
	errs = subs + dels + inss;

	if (errors[sentid] == "") {
		errors[sentid] = errs;
		total_wordsA += words;
		total_errsA += errs
		total_sentsA ++;
	} else {
		if (errs > errors[sentid]) greater++;
		else if (errs < errors[sentid]) less++;
		else equal++;
		total_wordsB += words;
		total_errsB += errs;
		total_sentsB ++;
	}
	next;
}
END {
	werA = (total_wordsA > 0 ? total_errsA/total_wordsA * 100 : 0);
	werB = (total_wordsB > 0 ? total_errsB/total_wordsB * 100 : 0);

	printf "result 1: %d errors (%.2f%%), %d words, %d sentences\n", \
		total_errsA, werA, total_wordsA, total_sentsA;
	printf "result 2: %d errors (%.2f%%), %d words, %d sentences\n", \
		total_errsB, werB, total_wordsB, total_sentsB;
	printf "less %d, greater %d, equal %d, different %d (%+.2f%%)\n", \
		less, greater, equal, less + greater, werB - werA;
	if (less + greater > 0) {
		printf "significance:\n"
		system("cumbin " (less+greater) " " (less>greater ? less : greater));
	}
}
' $tmpdir/$pralignA.pra $tmpdir/$pralignB.pra

181
language_model/srilm-1.7.3/utils/src/compute-best-mix.gawk
Executable file
@@ -0,0 +1,181 @@
#!/usr/local/bin/gawk -f
#
# compute-best-mix --
#	Compute the best mixture weight (-lambda) for interpolating N
#	LMs.
#
# usage: compute-best-mix [lambda="l1 l2 ..."] [precision=p] pplout1 pplout2 ...
#
# where pplout1, pplout2, ... is the output of ngram -debug 2 -ppl for the
# models.  li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-mix.gawk,v 1.13 2017/12/22 01:34:49 stolcke Exp $
#
BEGIN {
	verbose = 0;

	lambda = "0.5";
	precision = 0.001;
	M_LN10 = 2.30258509299404568402;	# from <math.h>

	logINF = -320;

	unk = "<unk>";
}
function abs(x) {
	return (x < 0) ? -x : x;
}
function log10(x) {
	return log(x) / M_LN10;
}
function exp10(x) {
	if (x < logINF) {
		return 0;
	} else {
		return exp(x * M_LN10);
	}
}
function addlogs(x,y) {
	if (x<y) {
		temp = x; x = y; y = temp;
	}
	return x + log10(1 + exp10(y - x));
}

function print_vector(x, n) {
	result = "(" x[1];
	for (k = 2; k <= n; k++) {
		result = result " " x[k];
	}
	return result ")"
}

function print_vector_pairwise(x, n) {
	total_lambda = x[1];
	result = "(" 1;
	for (k = 2; k <= n; k++) {
		total_lambda += x[k];
		result = result " " x[k]/total_lambda;
	}
	return result ")"
}

FNR == 1 {
	nfiles ++;
}
$1 == "p(" {

	word = $2;
	# Canonicalize input to have at most one representative context word;
	sub("[|] [^)]*)", "| X )");
	$0 = $0;

	if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) {
		prob = logINF;
	} else {
		prob = $10;
	}

	# If a count is given.
	if ($11 ~ /^[*]/) {
		count = substr($11,2);
	} else {
		count = 1;
	}

	sample_no = ++ nsamples[nfiles];
	samples[nfiles " " sample_no] = prob;
	counts[sample_no] = count;

	if (sample_no in words) {
		if (word != words[sample_no] && word != unk && words[sample_no] != unk) {
			print "warning: word mismatch in file " FILENAME ", token " sample_no \
				": " word " != " words[sample_no] > "/dev/stderr";
		}
	} else {
		words[sample_no] = word;
	}
}
END {
	for (i = 2; i <= nfiles; i ++) {
		if (nsamples[i] != nsamples[1]) {
			printf "mismatch in number of samples (%d != %d)", \
				nsamples[1], nsamples[i] >> "/dev/stderr";
			exit(1);
		}
	}

	last_prior = 0.0;

	# initialize priors from lambdas
	nlambdas = split(lambda, lambdas);
	lambda_sum = 0.0;
	for (i = 1; i <= nlambdas; i ++) {
		priors[i] = lambdas[i];
		lambda_sum += lambdas[i];
	}
	# fill in the missing lambdas
	for (i = nlambdas + 1; i <= nfiles; i ++) {
		priors[i] = (1 - lambda_sum)/(nfiles - nlambdas);
	}

	iter = 0;
	have_converged = 0;
	while (!have_converged) {
		iter ++;

		num_oovs = num_words = 0;
		delete post_totals;
		log_like = 0;

		for (j = 1; j <= nsamples[1]; j ++) {

			all_inf = 1;
			for (i = 1; i <= nfiles; i ++) {
				sample = samples[i " " j];
				logpost[i] = log10(priors[i]) + sample;
				all_inf = all_inf && (sample == logINF);
				if (i == 1) {
					logsum = logpost[i];
				} else {
					logsum = addlogs(logsum, logpost[i]);
				}
			}

			# skip OOV words
			if (all_inf) {
				num_oovs += counts[j];
				continue;
			}

			num_words += counts[j];
			log_like += logsum * counts[j];

			for (i = 1; i <= nfiles; i ++) {
				post_totals[i] += exp10(logpost[i] - logsum) * counts[j];
			}
		}

		printf "iteration %d, lambda = %s, ppl = %g\n", \
			iter, print_vector(priors, nfiles), \
			exp10(-log_like/num_words) >> "/dev/stderr";
		fflush();


		have_converged = 1;
		for (i = 1; i <= nfiles; i ++) {
			last_prior = priors[i];
			priors[i] = post_totals[i]/num_words;

			if (abs(last_prior - priors[i]) > precision) {
				have_converged = 0;
			}
		}
	}

	printf "%d non-oov words, best lambda %s\n",
		num_words, print_vector(priors, nfiles);
	printf "pairwise cumulative lambda %s\n",
		print_vector_pairwise(priors, nfiles);
}
166
language_model/srilm-1.7.3/utils/src/compute-best-rover-mix.gawk
Executable file
@@ -0,0 +1,166 @@
#!/usr/local/bin/gawk -f
#
# compute-best-rover-mix --
#	Compute the best mixture weight for combining multiple sausages
#
# usage: compute-best-rover-mix [lambda="l1 l2 ..."] [addone=N] [precision=p] nbest-rover-ref-posteriors-output
#
# where the input is the output of nbest-rover -write-ref-posteriors .
# li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-rover-mix.gawk,v 1.6 2016-12-10 07:06:41 stolcke Exp $
#
BEGIN {
	verbose = 0;

	lambda = "0.5";
	addone = 0;
	precision = 0.001;
	M_LN10 = 2.30258509299404568402;	# from <math.h>

	logINF = -320;

	zero_probs = 0;
}
function abs(x) {
	return (x < 0) ? -x : x;
}
function log10(x) {
	return log(x) / M_LN10;
}
function exp10(x) {
	if (x < logINF) {
		return 0;
	} else {
		return exp(x * M_LN10);
	}
}
function addlogs(x,y) {
	if (x<y) {
		temp = x; x = y; y = temp;
	}
	return x + log10(1 + exp10(y - x));
}

function print_vector(x, n) {
	result = x[1];
	for (k = 2; k <= n; k++) {
		result = result " " x[k];
	}
	return result;
}

{
	nsystems = NF - 4;

	if ($4 == 0) {
		zero_probs ++;
	} else {
		sample_no ++;

		for (i = 1; i <= nsystems; i++) {
			samples[i " " sample_no] = $(i + 4);
		}
	}
}

END {
	last_prior = 0.0;

	# initialize priors from lambdas
	nlambdas = split(lambda, lambdas);
	lambda_sum = 0.0;
	for (i = 1; i <= nlambdas; i ++) {
		priors[i] = lambdas[i];
		lambda_sum += lambdas[i];
	}
	# fill in the missing lambdas
	for (i = nlambdas + 1; i <= nsystems; i ++) {
		priors[i] = (1 - lambda_sum)/(nsystems - nlambdas);
	}

	# set up weight tying - assign input systems (weights) to tying bins
	if (tying) {
		ntying = split(tying, tying_bins);
		for (i = 1; i <= ntying && i <= nsystems; i ++) {
			this_bin = int(tying_bins[i]);
			if (this_bin <= 0) {
				print "invalid tying bin: " tying_bins[i];
				exit(1);
			}
			binfor[i] = this_bin;
			weights_in_bin[this_bin] += 1;

			if (this_bin > nbins) nbins = this_bin;
		}
	} else {
		i = 1;
		nbins = 0;
	}
	# assign unique bins for weights not covered in tying argument string
	for ( ; i <= nsystems; i ++) {
		binfor[i] = ++nbins;
		weights_in_bin[nbins] = 1;
	}


	iter = 0;
	have_converged = 0;
	while (!have_converged) {
		iter ++;

		num_words = 0;
		delete post_totals;
		log_like = 0;

		for (j = 1; j <= sample_no; j ++) {

			all_inf = 1;
			for (i = 1; i <= nsystems; i ++) {
				sample = log10(samples[i " " j]);
				logpost[i] = log10(priors[i]) + sample;
				all_inf = all_inf && (sample == logINF);
				if (i == 1) {
					logsum = logpost[i];
				} else {
					logsum = addlogs(logsum, logpost[i]);
				}
			}

			# skip OOV words
			if (all_inf) {
				continue;
			}

			num_words ++;
			log_like += logsum;

			# total up the posteriors for each weight bin
			for (i = 1; i <= nsystems; i ++) {
				post_totals[binfor[i]] += exp10(logpost[i] - logsum);
			}
		}
		printf "iteration %d, lambda = %s, ppl = %g\n", \
			iter, print_vector(priors, nsystems), \
			exp10(-log_like/num_words) >> "/dev/stderr";
		fflush();


		have_converged = 1;
		for (i = 1; i <= nsystems; i ++) {
			last_prior = priors[i];
			priors[i] = (post_totals[binfor[i]]/weights_in_bin[binfor[i]] + addone)/(num_words + nsystems * addone);

			if (abs(last_prior - priors[i]) > precision) {
				have_converged = 0;
			}
		}
	}

	weights = print_vector(priors, nsystems);
	printf "%d alignment positions, best lambda (%s)\n", num_words, weights;
	if (write_weights) {
		print weights > write_weights;
	}
}
159
language_model/srilm-1.7.3/utils/src/compute-best-sentence-mix.gawk
Executable file
@@ -0,0 +1,159 @@
#!/usr/local/bin/gawk -f
#
# compute-best-sentence-mix --
#	Compute the best sentence-level mixture weight for interpolating N
#	LMs.
#
# usage: compute-best-sentence-mix [lambda="l1 l2 ..."] [addone=N] [precision=p] pplout1 pplout2 ...
#
# where pplout1, pplout2, ... is the output of ngram -debug 1 -ppl for the
# models.  li are initial guesses at the mixture weights, and p is the
# precision with which the best lambda vector is to be found.
#
# $Header: /home/srilm/CVS/srilm/utils/src/compute-best-sentence-mix.gawk,v 1.4 2016/06/01 20:20:38 stolcke Exp $
#
BEGIN {
	verbose = 0;

	lambda = "0.5";
	addone = 0;
	precision = 0.001;
	M_LN10 = 2.30258509299404568402;	# from <math.h>

	logINF = -320;
}
function abs(x) {
	return (x < 0) ? -x : x;
}
function log10(x) {
	return log(x) / M_LN10;
}
function exp10(x) {
	if (x < logINF) {
		return 0;
	} else {
		return exp(x * M_LN10);
	}
}
function addlogs(x,y) {
	if (x<y) {
		temp = x; x = y; y = temp;
	}
	return x + log10(1 + exp10(y - x));
}

function print_vector(x, n) {
	result = "(" x[1];
	for (k = 2; k <= n; k++) {
		result = result " " x[k];
	}
	return result ")"
}

FNR == 1 {
	nfiles ++;
	num_words = 0;
	num_sentences = 0;
}

# 1 sentences, 6 words, 0 OOVs
/^1 sentences, [0-9]* words, [0-9]* OOVs/ {
	# exclude OOVs
	num_words += $3 - $5;
	expect_logprob = 1;
}

# 0 zeroprobs, logprob= -22.9257 ppl= 1884.06 ppl1= 6621.32
/^[0-9]* zeroprobs, logprob= / && expect_logprob {

	# exclude zero prob words
	num_words -= $1;
	num_sentences += 1;

	if ($4 ~ /-[Ii]nf|-1\.#INF/) {
		prob = logINF;
	} else {
		prob = $4;
	}

	sample_no = ++ nsamples[nfiles];
	samples[nfiles " " sample_no] = prob;

	expect_logprob = 0;
}
END {
	for (i = 2; i <= nfiles; i ++) {
		if (nsamples[i] != nsamples[1]) {
			printf "mismatch in number of samples (%d != %d)", \
				nsamples[1], nsamples[i] >> "/dev/stderr";
			exit(1);
		}
	}

	last_prior = 0.0;

	# initialize priors from lambdas
	nlambdas = split(lambda, lambdas);
	lambda_sum = 0.0;
	for (i = 1; i <= nlambdas; i ++) {
		priors[i] = lambdas[i];
		lambda_sum += lambdas[i];
	}
	# fill in the missing lambdas
	for (i = nlambdas + 1; i <= nfiles; i ++) {
		priors[i] = (1 - lambda_sum)/(nfiles - nlambdas);
	}

	iter = 0;
	have_converged = 0;
	while (!have_converged) {
		iter ++;

		delete post_totals;
		log_like = 0;

		for (j = 1; j <= nsamples[1]; j ++) {

			all_inf = 1;
			for (i = 1; i <= nfiles; i ++) {
				sample = samples[i " " j];
				logpost[i] = log10(priors[i]) + sample;
				all_inf = all_inf && (sample == logINF);
				if (i == 1) {
					logsum = logpost[i];
				} else {
					logsum = addlogs(logsum, logpost[i]);
				}
			}

			# skip OOV words
			if (all_inf) {
				continue;
			}

			log_like += logsum;

			for (i = 1; i <= nfiles; i ++) {
				post_totals[i] += exp10(logpost[i] - logsum);
			}
		}
		printf "iteration %d, lambda = %s, ppl = %g\n", \
			iter, print_vector(priors, nfiles), \
			exp10(-log_like/(num_words + num_sentences)) \
			>> "/dev/stderr";
		fflush();

		have_converged = 1;
		for (i = 1; i <= nfiles; i ++) {
			last_prior = priors[i];
			priors[i] = (post_totals[i] + addone)/(num_sentences + nfiles * addone);

			if (abs(last_prior - priors[i]) > precision) {
				have_converged = 0;
			}
		}
	}

	printf "%d sentences, %d non-oov words, best lambda %s\n",
		num_sentences, num_words, print_vector(priors, nfiles);
}
81
language_model/srilm-1.7.3/utils/src/compute-oov-rate.gawk
Executable file
@@ -0,0 +1,81 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# compute-oov-rate --
|
||||
# Compute OOV word rate from a vocabulary and a unigram count file
|
||||
#
|
||||
# usage: compute-oov-rate vocab countfile ...
|
||||
#
|
||||
# Assumes unigram counts do not have repeated words.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/compute-oov-rate.gawk,v 1.10 2018/01/24 03:35:38 stolcke Exp $
|
||||
#
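# A hypothetical example (file names are placeholders):
#
#   ngram-count -text test.txt -order 1 -write test.1cnt
#   compute-oov-rate vocab.txt test.1cnt
#
# reports OOV token and type rates, with and without word fragments.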
|
||||
|
||||
BEGIN {
|
||||
# high bit characters also detect multibyte characters
|
||||
letter = "[[:alpha:]\x80-\xFF]";
|
||||
if ("x" !~ letter) letter = "[A-Za-z\x80-\xFF]";
|
||||
}
|
||||
|
||||
# Read vocab
|
||||
#
|
||||
ARGIND == 1 {
|
||||
vocab[$1] = 1;
|
||||
}
|
||||
|
||||
function is_fragment(word) {
|
||||
return word ~ (letter "-$") || word ~ ("^-" letter);
|
||||
}
|
||||
|
||||
#
|
||||
# Read counts
|
||||
#
|
||||
ARGIND > 1 {
|
||||
if ($1 == "<s>" || $1 == "</s>" || $1 == "-pau-") {
|
||||
next;
|
||||
}
|
||||
|
||||
total_count += $2;
|
||||
total_types ++;
|
||||
|
||||
if (!vocab[$1]) {
|
||||
oov_count += $2;
|
||||
oov_types ++;
|
||||
|
||||
if (debug) {
|
||||
print "OOV: " $1, $2 > "/dev/stderr";
|
||||
}
|
||||
|
||||
if (!is_fragment($1)) {
|
||||
if (write_oov_words) {
|
||||
print > write_oov_words;
|
||||
}
|
||||
} else {
|
||||
if (write_oov_frags) {
|
||||
print > write_oov_frags;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_fragment($1)) {
|
||||
total_nofrag_count += $2;
|
||||
total_nofrag_types ++;
|
||||
|
||||
if (!vocab[$1]) {
|
||||
oov_nofrag_count += $2;
|
||||
oov_nofrag_types ++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
END {
|
||||
printf "OOV tokens: %d / %d (%.2f%%) ", \
|
||||
oov_count, total_count, total_count == 0 ? 0 : 100 * oov_count/total_count;
|
||||
printf "excluding fragments: %d / %d (%.2f%%)\n", \
|
||||
oov_nofrag_count, total_nofrag_count, \
|
||||
total_nofrag_count == 0 ? 0 : 100 * oov_nofrag_count/total_nofrag_count;
|
||||
printf "OOV types: %d / %d (%.2f%%) ", \
|
||||
oov_types, total_types, total_types == 0 ? 0 : 100 * oov_types/total_types;
|
||||
printf "excluding fragments: %d / %d (%.2f%%)\n", \
|
||||
oov_nofrag_types, total_nofrag_types, \
|
||||
total_nofrag_types == 0 ? 0 : 100 * oov_nofrag_types/total_nofrag_types;
|
||||
}
|
||||
252
language_model/srilm-1.7.3/utils/src/compute-sclite
Executable file
@@ -0,0 +1,252 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# compute-sclite --
|
||||
# compute word error rate from a sentid hyp file and a sentid reference
|
||||
# file, using the NIST 'sclite' program
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/compute-sclite,v 1.49 2016/09/23 20:05:51 stolcke Exp $
|
||||
#
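# A hypothetical invocation (file names are placeholders):
#
#   compute-sclite -h decode.ctm -r eval.stm -g eval.glm
#
# scores CTM hypotheses against STM references after csrfilt.sh/GLM filtering.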
|
||||
|
||||
# enforce proper sorting order
|
||||
LC_COLLATE=C
|
||||
export LC_COLLATE
|
||||
|
||||
reject="@reject@"
|
||||
|
||||
sclite=sclite
|
||||
|
||||
subsets=
|
||||
remove_periods=
|
||||
format_sentids=1
|
||||
|
||||
if [ $# -lt 2 ]; then
|
||||
echo "usage: $0 [-v] -h hyps -r refs [-S id-subset] [-M|-multiwords] [-noperiods] [-g glm-file] [sclite-options ...]" >&2
|
||||
echo " or $0 hyps refs" >&2
|
||||
exit 2
|
||||
elif [ $# -eq 2 ]; then
|
||||
# old syntax
|
||||
hyps=${1}
|
||||
refs=${2}
|
||||
else
|
||||
# parse arguments
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
-v) verbose=1 ;;
|
||||
-r) refs=$2; shift ;;
|
||||
-h) hyps="$hyps $2"
|
||||
name=`basename $2`
|
||||
shift ;;
|
||||
-S) subsets="$subsets $2"; shift ;;
|
||||
-M|-multiwords)
|
||||
multiwords=1 ;;
|
||||
-noperiods)
|
||||
remove_periods=1 ;;
|
||||
-H) remove_hesitations=1 ;;
|
||||
-keep_bracketed)
|
||||
keep_bracketed=1 ;;
|
||||
-R) reject="<>" ;;
|
||||
-g) glmfile=$2; shift ;;
|
||||
-s) case_sensitive=1 ;;
|
||||
-overlap-limit)
|
||||
options="$options $1 $2"
|
||||
sclite=asclite
|
||||
shift;;
|
||||
-raw-sentids)
|
||||
format_sentids=0
|
||||
;;
|
||||
*) options="$options $1" ;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
fi
|
||||
|
||||
if [ -n "$case_sensitive" ]; then
|
||||
filter_options="-s";
|
||||
options="$options -s";
|
||||
fi
|
||||
|
||||
tmpdir=${TMPDIR-/tmp}
|
||||
sentids="$tmpdir/ce.sentids$$"
|
||||
speakers="$tmpdir/ce.speakers$$"
|
||||
sortedrefs="$tmpdir/ce.refs$$"
|
||||
sortedhyps="$tmpdir/ce.hyps$$"
|
||||
ignorehyps="$tmpdir/ce.ign$$"
|
||||
|
||||
if [ -z "$verbose" ]; then
|
||||
trap '/bin/rm -f $sentids $speakers $sortedrefs $sortedhyps $ignorehyps' \
|
||||
0 1 2 13 15
|
||||
fi
|
||||
|
||||
set -e
|
||||
|
||||
multijoin () {
|
||||
if [ $# -eq 1 ]; then
|
||||
cat $1
|
||||
else
|
||||
join $1 $2 | { shift; shift; multijoin - "$@"; }
|
||||
fi
|
||||
}
|
||||
|
||||
#
|
||||
# extract and sort sentids from hyps
|
||||
# (for CTM hyps these are just waveform/channel labels)
|
||||
#
|
||||
|
||||
case "$hyps" in
|
||||
*.ctm)
|
||||
cat $hyps | \
|
||||
${GAWK-gawk} '!/^;;/ && $7 != "non-lex" && $7 != "fp" { print $1 "_" $2 }' ;;
|
||||
*) cat $hyps | ${GAWK-gawk} '{ print $1 }' ;;
|
||||
esac | \
|
||||
sort | \
|
||||
multijoin - $subsets > $sentids
|
||||
|
||||
#
|
||||
# extract list of "speakers" (waveform/channel labels)
|
||||
#
|
||||
case "$hyps" in
|
||||
*.ctm)
|
||||
cat $sentids | uniq | tr '[A-Z]' '[a-z]' | sort > $speakers
|
||||
;;
|
||||
*) sed 's,\([-_][ABab12]\)[-_].*,\1,' $sentids | uniq | \
|
||||
tr '[A-Z]' '[a-z]' | sort > $speakers
|
||||
;;
|
||||
esac
|
||||
|
||||
#
|
||||
# extract and sort refs for these sentids
|
||||
#
|
||||
case "$refs" in
|
||||
*.stm) # NIST scoring:
|
||||
# filter out speakers not occurring in hyp file
|
||||
${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' $refs | \
|
||||
sort -k 1,1 -k 5,5n | \
|
||||
join - $speakers | \
|
||||
${GAWK-gawk} '{ $1 = ""; if ($7 ~ /^<.*>$/) $7 = "<>"; print }' | \
|
||||
if [ -n "$glmfile" ]; then
|
||||
${GAWK-gawk} '{ gsub("-","_",$1); gsub("-","_",$3); print }' | \
|
||||
csrfilt.sh $filter_options -i stm -t ref -dh $glmfile
|
||||
else
|
||||
cat
|
||||
fi > $sortedrefs
|
||||
;;
|
||||
*.stm.filt) # NIST scoring with pre-filtered references
|
||||
# filter out speakers not occurring in hyp file
|
||||
${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' $refs | \
|
||||
sort -k 1,1 -k 5,5n | \
|
||||
join - $speakers | \
|
||||
${GAWK-gawk} '{ $1 = ""; if ($7 ~ /^<.*>$/) $7 = "<>"; print }' | \
|
||||
if [ -n "$glmfile" ]; then
|
||||
${GAWK-gawk} '{ gsub("-","_",$1); gsub("-","_",$3); print }'
|
||||
else
|
||||
cat
|
||||
fi > $sortedrefs
|
||||
;;
|
||||
*) sort "$refs" | join - $sentids | \
|
||||
${GAWK-gawk} '{ if (multiwords) for (i = 2; i <= NF; i++) \
|
||||
gsub("_", " ", $i); print }'\
|
||||
multiwords=$multiwords | \
|
||||
sed -e 's,\[[^]]*\],,g' | \
|
||||
sentid-to-sclite format_sentids=$format_sentids | \
|
||||
if [ -n "$glmfile" ]; then
|
||||
csrfilt.sh $filter_options -i trn -t hyp -dh $glmfile
|
||||
else
|
||||
cat
|
||||
fi > $sortedrefs
|
||||
|
||||
# find segments to ignore
|
||||
${GAWK-gawk} 'NF == 2 && tolower($2) == "ignore_time_segment_in_scoring" \
|
||||
{ print $1 }' < $refs | \
|
||||
sort > $ignorehyps
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ ! -s $sortedrefs ]; then
|
||||
echo "Filtered references are empty" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
#
|
||||
# sort and condition hyps
|
||||
#
|
||||
case "$refs" in
|
||||
*.stm|*.stm.filt) # NIST scoring
|
||||
# sclite will handle ignored segments
|
||||
case "$hyps" in
|
||||
*.ctm)
|
||||
cat $hyps | ${GAWK-gawk} '!/^;;/ { print tolower($1 "_" $2), $0 }' | \
|
||||
sort -b -k 1,1 -k 2,2 -k 3,3 -k 4,4n | join - $speakers | \
|
||||
${GAWK-gawk} '{ $1 = ""; print }' ;;
|
||||
*) sort -k 1,1 $hyps | join - $sentids | sentid-to-ctm ;;
|
||||
esac | \
|
||||
${GAWK-gawk} '{ # handle new-style CTM format (convert it to old format)
|
||||
if (NF >= 7) {
|
||||
if ($7 != "lex") next;
|
||||
else $7 = $8 = "";
|
||||
}
|
||||
if (remove_periods) gsub("[.]", "", $5);
|
||||
print;
|
||||
}' remove_periods=$remove_periods | \
|
||||
if [ -n "$glmfile" ]; then
|
||||
${GAWK-gawk} '{ gsub("-","_",$1); print }' | \
|
||||
csrfilt.sh $filter_options -i ctm -t hyp -dh $glmfile | \
|
||||
if [ -n "$remove_hesitations" ]; then
|
||||
grep -vi '%HESITATION'
|
||||
else
|
||||
cat
|
||||
fi
|
||||
else
|
||||
cat
|
||||
fi > $sortedhyps
|
||||
;;
|
||||
*) # we have to remove ignored segments ourselves
|
||||
sort -k 1,1 $hyps | join - $sentids | join -v 1 - $ignorehyps | \
|
||||
${GAWK-gawk} '{ if (multiwords) for (i = 2; i <= NF; i++) gsub("_", " ", $i);
|
||||
if (remove_periods) for (i = 2; i <= NF; i++) gsub("[.]", "", $i);
|
||||
print }'\
|
||||
remove_periods=$remove_periods multiwords=$multiwords | \
|
||||
sed -e 's,\[[^]]*\],,g' \
|
||||
-e "s,$reject,,g" \
|
||||
-e 's,-pau-,,g' | \
|
||||
if [ -z "$keep_bracketed" ]; then
|
||||
sed -e 's,<[^>]*>,,g'
|
||||
else
|
||||
cat
|
||||
fi |\
|
||||
sentid-to-sclite format_sentids=$format_sentids |\
|
||||
if [ -n "$glmfile" ]; then
|
||||
csrfilt.sh $filter_options -i trn -t hyp -dh $glmfile | \
|
||||
if [ -n "$remove_hesitations" ]; then
|
||||
sed -e 's/\%HESITATION//g' -e 's/\%hesitation//g'
|
||||
else
|
||||
cat
|
||||
fi
|
||||
else
|
||||
cat
|
||||
fi > $sortedhyps
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ ! -s $sortedhyps ]; then
|
||||
echo "Filtered hypotheses are empty" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
[ "$verbose" ] && set -x
|
||||
|
||||
case $sclite in
|
||||
sclite) options="-n $name $options" ;;
|
||||
esac
|
||||
|
||||
case "$refs" in
|
||||
*.stm|*.stm.filt) # NIST scoring
|
||||
$sclite -f 0 -O . \
|
||||
-h $sortedhyps ctm $name -r $sortedrefs stm \
|
||||
-D $options
|
||||
;;
|
||||
*) $sclite -f 0 -O . \
|
||||
-h $sortedhyps trn $name -r $sortedrefs trn \
|
||||
-i swb $options
|
||||
;;
|
||||
esac
|
||||
|
||||
153
language_model/srilm-1.7.3/utils/src/compute-sclite-nbest
Executable file
@@ -0,0 +1,153 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# compute-sclite-nbest --
|
||||
# Compute errors for nbest hypotheses using sclite
|
||||
# for use with nbest-optimize -errors option
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/compute-sclite-nbest,v 1.5 2016/09/23 20:05:51 stolcke Exp $
|
||||
#
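# A hypothetical invocation (paths are placeholders; nbest.list names the
# gzipped N-best files, one per line):
#
#   compute-sclite-nbest nbest.list error-dir -r refs.txt
#
# writes one gzipped error-count file per utterance into error-dir, in the
# format expected by nbest-optimize -errors.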
|
||||
|
||||
usage () {
|
||||
echo "$0 nbest-files output-dir -r refs [-filter script] [sclite-options]"
|
||||
}
|
||||
|
||||
if [ $# -lt 2 ]; then
|
||||
usage;
|
||||
exit 2
|
||||
fi
|
||||
|
||||
filter=cat
|
||||
nbest_files=$1
|
||||
output_dir=$2
|
||||
shift; shift
|
||||
|
||||
while [ $# -gt 0 ]
|
||||
do
|
||||
case "$1" in
|
||||
-r) refs=$2
|
||||
shift; shift
|
||||
;;
|
||||
-filter) filter="$2"
|
||||
shift; shift
|
||||
;;
|
||||
*) sclite_options="$sclite_options $1"
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ -z "$refs" ]; then
|
||||
usage
|
||||
exit 2
|
||||
fi
|
||||
|
||||
TMPDIR=${TMPDIR-/tmp}
|
||||
|
||||
sortedrefs=$TMPDIR/sortedrefs.$$
|
||||
nbestrefs=$TMPDIR/nbestrefs.$$
|
||||
nbesthyps=$TMPDIR/nbesthyps.$$
|
||||
scliteout=$TMPDIR/scliteout.$$
|
||||
|
||||
trap "/bin/rm -f $sortedrefs $nbestrefs $nbesthyps $scliteout; exit 1" 1 2 13 15
|
||||
|
||||
set -e
|
||||
|
||||
sort -k 1,1 $refs > $sortedrefs
|
||||
|
||||
> $nbestrefs
|
||||
> $nbesthyps
|
||||
|
||||
#
|
||||
# Prepare hyp and reference files
|
||||
#
|
||||
cat $nbest_files | \
|
||||
sed 's,.*/\(.*\).gz$,\1 &,' | \
|
||||
sort -k 1,1 | \
|
||||
join - $sortedrefs | \
|
||||
while read sentid nbestlist refwords
|
||||
do
|
||||
if [ -z "$refwords" ]; then
|
||||
echo "warning: $sentid has no reference" >&2
|
||||
continue
|
||||
fi
|
||||
|
||||
echo $sentid >&2
|
||||
|
||||
gunzip -cf $nbestlist | \
|
||||
nbest-words | \
|
||||
$filter | \
|
||||
${GAWK-gawk} \
|
||||
-v nbestrefs=$nbestrefs -v nbesthyps=$nbesthyps \
|
||||
-v outdir=$output_dir \
|
||||
-v sentid=$sentid -v refwords="$refwords" '{
|
||||
if (refwords == "ignore_time_segment_in_scoring") {
|
||||
# this utterance is to be ignored --
|
||||
# we generate dummy error information directly
|
||||
# nbest-optimize(1) error count format is: wcr wer nsub ndel nins nerr nw
|
||||
print 0, 0, 0, 0, 0, 0, 0 | "gzip > " outdir "/" sentid ".gz";
|
||||
} else {
|
||||
gsub("<[^ ]*>", "");
|
||||
gsub("-pau-", "");
|
||||
|
||||
hypid = sprintf("%s#%05d", sentid, NR);
|
||||
print hypid, refwords >> nbestrefs;
|
||||
print hypid, $0 >> nbesthyps;
|
||||
}
|
||||
}'
|
||||
done
|
||||
|
||||
if [ -s $nbestrefs ]; then
|
||||
#
|
||||
# Run the scoring
|
||||
#
|
||||
(set -x; compute-sclite \
|
||||
-raw-sentids \
|
||||
$sclite_options \
|
||||
-O $TMPDIR -l 1000 \
|
||||
-r $nbestrefs \
|
||||
-h $nbesthyps \
|
||||
-o pralign )
|
||||
|
||||
#
|
||||
# Extract error counts from sclite pra output
|
||||
#
|
||||
${GAWK-gawk} -v outdir=$output_dir '
|
||||
$1 == "id:" {
|
||||
sentid = $2;
|
||||
sub("^\\(", "", sentid);
|
||||
# strip the hyp number
|
||||
sub("#[0-9]*)$", "", sentid);
|
||||
|
||||
# sclite lowercases sentids
|
||||
# Heuristically restore channel letters to uppercase
|
||||
sub("_a_", "_A_", sentid);
|
||||
sub("_b_", "_B_", sentid);
|
||||
sub("-a-", "-A-", sentid);
|
||||
sub("-b-", "-B-", sentid);
|
||||
|
||||
if (sentid != last_sentid) {
|
||||
if (outfile) close(outfile);
|
||||
outfile = "gzip > " outdir "/" sentid ".gz"
|
||||
last_sentid = sentid;
|
||||
}
|
||||
|
||||
next;
|
||||
}
|
||||
$1 == "Scores:" {
|
||||
corr = $6;
|
||||
subs = $7;
|
||||
dels = $8;
|
||||
inss = $9;
|
||||
|
||||
words = corr + subs + dels;
|
||||
errs = subs + dels + inss;
|
||||
wer = words > 0 ? errs/words : 0;
|
||||
# nbest-optimize(1) error count format is: wcr wer nsub ndel nins nerr nw
|
||||
print words-dels-subs, wer, subs, dels, inss, errs, words | outfile;
|
||||
}
|
||||
END {
|
||||
if (outfile) close(outfile);
|
||||
}' $nbesthyps.pra
|
||||
fi
|
||||
|
||||
/bin/rm -f $sortedrefs $nbestrefs $nbesthyps $nbesthyps.pra
|
||||
152
language_model/srilm-1.7.3/utils/src/concat-sausages.gawk
Executable file
@@ -0,0 +1,152 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# concat-sausages --
|
||||
# concatenate a list of sausages into a single word confusion network
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/concat-sausages.gawk,v 1.1 2019/02/09 07:34:35 stolcke Exp $
|
||||
#
|
||||
# input format:
|
||||
#
|
||||
# name Speech012_apple-iphone-6s-agc_00001330_00010030
|
||||
# numaligns 32
|
||||
# posterior 1
|
||||
# align 0 <s> 1
|
||||
# info 0 <s> 1.33 0.06 0 0 : :
|
||||
# align 1 OK 1
|
||||
# info 1 OK 1.39 0.5 0 0 : :
|
||||
# align 2 *DELETE* 1 I 3.110077054250103e-33 we 3.193624897980025e-52 i 7.615703946522299e-53
|
||||
# info 2 I 1.83 0.06 0 0 : :
|
||||
# info 2 we 1.85 0.06 0 0 : :
|
||||
# info 2 i 1.83 0.06 0 0 : :
|
||||
#
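# A hypothetical invocation (file names are placeholders):
#
#   concat-sausages part1.sausage.gz part2.sausage.gz > whole.sausage
#
# The </s>/<s> alignment positions at the seams are dropped so the result
# reads as a single confusion network.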
|
||||
|
||||
BEGIN {
|
||||
name = "";
|
||||
numaligns = 0;
|
||||
posterior = 0;
|
||||
if (posterior_factor == "") {
|
||||
posterior_factor = 1;
|
||||
}
|
||||
|
||||
sent_start = "<s>";
|
||||
sent_end = "</s>";
|
||||
|
||||
epsilon = 1e-05;
|
||||
}
|
||||
|
||||
function abs(x) {
|
||||
return x < 0 ? -x : x;
|
||||
}
|
||||
|
||||
function process_sausage(file, remove_start, remove_end) {
|
||||
|
||||
if (file ~ /.*\.gz$|.*\.Z/) {
|
||||
input = "exec gunzip -c " file;
|
||||
} else {
|
||||
input = "exec cat " file;
|
||||
}
|
||||
|
||||
while ((status = (input | getline)) > 0) {
|
||||
|
||||
if ($1 == "name") {
|
||||
if (output_name != "") {
|
||||
name = output_name;
|
||||
} else if (name == "") {
|
||||
name = $2;
|
||||
} else {
|
||||
name = name "+" $2
|
||||
}
|
||||
|
||||
} else if ($1 == "posterior") {
|
||||
if (posterior != 0 && abs($2 - posterior) > epsilon) {
|
||||
print file ": incompatible posterior: " $2 > "/dev/stderr"
|
||||
exit(1);
|
||||
} else {
|
||||
posterior = $2;
|
||||
# if (posterior_factor != 1) {
|
||||
# posterior *= posterior_factor;
|
||||
# }
|
||||
}
|
||||
} else if ($1 == "numaligns") {
|
||||
# offset for renumbered alignments
|
||||
start_alignment = numaligns;
|
||||
} else if ($1 == "align") {
|
||||
|
||||
$2 = $2 + start_alignment;
|
||||
|
||||
if (posterior_factor != 1 && $3 != sent_start && $3 != sent_end) {
|
||||
for (i = 4; i <= NF; i += 2) {
|
||||
$i *= posterior_factor;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# remove alignment positions that are just for
|
||||
# start/end sentence tags, if so desired
|
||||
#
|
||||
if (NF == 4 && $3 == sent_start && remove_start) {
|
||||
start_alignment --;
|
||||
;
|
||||
} else if (NF == 4 && $3 == sent_end && remove_end) {
|
||||
start_alignment --;
|
||||
;
|
||||
} else {
|
||||
alignments[$2] = $0;
|
||||
|
||||
if ($2 + 1 > numaligns) {
|
||||
numaligns = $2 + 1;
|
||||
}
|
||||
}
|
||||
} else if ($1 == "info") {
|
||||
|
||||
$2 = $2 + start_alignment;
|
||||
|
||||
if (!($2 in info)) {
|
||||
info[$2] = $0;
|
||||
} else {
|
||||
info[$2] = info[$2] "\n" $0;
|
||||
}
|
||||
} else if ($1 == "time") {
|
||||
; # ignore
|
||||
} else {
|
||||
print file ": unknown keyword: " $1 > "/dev/stderr";
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (status < 0) {
|
||||
print "error opening " file >> "/dev/stderr";
|
||||
}
|
||||
|
||||
close(input);
|
||||
}
|
||||
|
||||
function output_sausage() {
|
||||
print "name", name;
|
||||
print "numaligns", numaligns;
|
||||
print "posterior", posterior;
|
||||
|
||||
for (i = 0; i < numaligns; i ++) {
|
||||
if (i in alignments) {
|
||||
print alignments[i];
|
||||
if (i in info) {
|
||||
print info[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
if (ARGC < 2) {
|
||||
print "usage: " ARGV[0] " SAUSAGE1 SAUSAGE2 ..." \
|
||||
>> "/dev/stderr";
|
||||
exit(2);
|
||||
}
|
||||
|
||||
for (arg = 1; arg < ARGC; arg ++) {
|
||||
process_sausage(ARGV[arg], arg > 1, arg < ARGC-1);
|
||||
}
|
||||
|
||||
output_sausage();
|
||||
}
|
||||
|
||||
13
language_model/srilm-1.7.3/utils/src/context-ngrams.gawk
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# context-ngrams --
|
||||
# Extract counts corresponding to ngram contexts
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/context-ngrams.gawk,v 1.1 2008/09/30 03:54:05 stolcke Exp $
|
||||
#
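# For example, a trigram count line "a b c 5" is rewritten as the context
# count "a b  5" (the last word before the count is blanked out).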
|
||||
|
||||
NF > 2 {
|
||||
$(NF-1) = "";
|
||||
print $0;
|
||||
}
|
||||
|
||||
35
language_model/srilm-1.7.3/utils/src/continuous-ngram-count.gawk
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# continuous-ngram-count --
|
||||
# Generate ngram counts ignoring line breaks
|
||||
#
|
||||
# usage: continuous-ngram-count order=ORDER textfile | ngram-count -read -
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/continuous-ngram-count.gawk,v 1.1 1998/08/24 00:52:30 stolcke Exp $
|
||||
#
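# A hypothetical pipeline (file names are placeholders):
#
#   continuous-ngram-count order=3 corpus.txt | ngram-count -read - -lm cont.lm
#
# counts trigrams across line breaks and estimates an LM from the merged counts.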
|
||||
BEGIN {
|
||||
order = 3;
|
||||
|
||||
head = 0; # next position in ring buffer
|
||||
}
|
||||
|
||||
function process_word(w) {
|
||||
buffer[head] = w;
|
||||
|
||||
ngram = "";
|
||||
for (j = 0; j < order; j ++) {
|
||||
w1 = buffer[(head + order - j) % order];
|
||||
if (w1 == "") {
|
||||
break;
|
||||
}
|
||||
ngram = w1 " " ngram;
|
||||
print ngram 1;
|
||||
}
|
||||
head = (head + 1) % order;
|
||||
}
|
||||
|
||||
{
|
||||
for (i = 1; i <= NF; i ++) {
|
||||
process_word($i);
|
||||
}
|
||||
}
|
||||
80
language_model/srilm-1.7.3/utils/src/cumbin
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# This tool calculates probability over the tail of a binomial
|
||||
# distribution. The calculation is done directly, without using any
|
||||
# approximations.
|
||||
#
|
||||
# This program is in the public domain. It was written
|
||||
# by Brett Kessler and David Gelbart.
|
||||
#
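# For example, "cumbin 10 8" computes the chance of 8 or more successes in
# 10 fair trials: P(k >= 8 | n=10, p=0.5) = (45+10+1)/1024 ~= 0.0547.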
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use POSIX;
|
||||
|
||||
if (@ARGV != 2 && @ARGV != 3) {
|
||||
die "Usage: $0 n k [p]\n";
|
||||
}
|
||||
my $n = $ARGV[0];
|
||||
my $k = $ARGV[1];
|
||||
my $p = $ARGV[2];
|
||||
|
||||
if (!(defined $p)) {
|
||||
$p = 0.5;
|
||||
}
|
||||
|
||||
if (($n - $k) > $k) {
|
||||
die "Did you choose the right value of k?\n";
|
||||
}
|
||||
my $P = tailBinomial($n, $k, $p);
|
||||
print "One-tailed: P(k >= ${k} | n=${n}, p=${p}) = ${P}\n";
|
||||
$P = 2 * $P;
|
||||
print "Two-tailed: 2*P(k >= ${k} | n=${n}, p=${p}) = ${P}\n";
|
||||
|
||||
# Calculate the sum over the tail of the binomial probability distribution.
|
||||
sub tailBinomial {
|
||||
my($N, $k, $p) = @_;
|
||||
|
||||
my $sum = 0;
|
||||
for (my $i = $k; $i <= $N; $i++) {
|
||||
$sum += exp(logBinomial($N, $i, $p));
|
||||
}
|
||||
$sum;
|
||||
}
|
||||
|
||||
# We use logarithms during calculation to avoid overflow during the
|
||||
# calculation of factorials and underflow during the calculation of
|
||||
# powers of probabilities. This function calculates the log of
|
||||
# binomial probability for given N, k, p.
|
||||
sub logBinomial {
|
||||
my($N, $k, $p) = @_;
|
||||
my $q = 1 - $p;
|
||||
|
||||
# These safety checks were inspired by the code at
|
||||
# http://faculty.vassar.edu/lowry/binomialX.html
|
||||
die "Error: N not integer" if ($N != floor($N));
|
||||
die "Error: k not integer" if ($k != floor($k));
|
||||
die "Error: k > N" if ($k > $N);
|
||||
die "Error: p > 1" if ($p > 1);
|
||||
die "Error: N < 1" if ($N < 1);
|
||||
|
||||
logBinomCoeff($N, $k) + $k * log($p) + ($N - $k) * log($q);
|
||||
}
|
||||
|
||||
# Calculate the log of the binomial coefficient for given N and k.
|
||||
sub logBinomCoeff {
|
||||
my($N, $k) = @_;
|
||||
logFactorial($N) - logFactorial($k) - logFactorial($N - $k);
|
||||
}
|
||||
|
||||
# Calculate the log of the factorial of the argument.
|
||||
sub logFactorial {
|
||||
my($N) = @_;
|
||||
|
||||
my $prod = 0;
|
||||
for (my $i = 2; $i <= $N; $i++) {
|
||||
$prod += log($i);
|
||||
}
|
||||
|
||||
$prod;
|
||||
}
|
||||
89
language_model/srilm-1.7.3/utils/src/de-vq-lm.gawk
Executable file
@@ -0,0 +1,89 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# de-vq-lm --
|
||||
# Expand parameters in a quantized ARPA backoff LM
|
||||
#
|
||||
# usage: de-vq-lm bins=CW lm-file > sub-lm-file
|
||||
#
|
||||
# where CW defines the quantization bins.
|
||||
#
|
||||
# Copyright (c) 2012 Andreas Stolcke, Microsoft Corp. All Rights Reserved.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/de-vq-lm.gawk,v 1.2 2019/09/09 23:13:15 stolcke Exp $
|
||||
#
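# A hypothetical invocation (file names are placeholders):
#
#   de-vq-lm bins=lm.vqbins lm.quantized.arpa > lm.arpa
#
# replaces every VQ bin index in the LM with that bin's mean value from
# the codeword file.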
|
||||
|
||||
BEGIN {
|
||||
bins = "/dev/null";
|
||||
}
|
||||
|
||||
# read the cw file
|
||||
#
|
||||
#VQSize 256
|
||||
#Codeword Mean Count
|
||||
# 0 -12.7330028909195 10454
|
||||
# 1 -12.3314038288506 1494
|
||||
# etc.
|
||||
#
|
||||
NR == 1 {
|
||||
saveline = $0;
|
||||
|
||||
getline < bins;
|
||||
if ($1 != "VQSize") {
|
||||
print "file " bins " is not a VQ file" > "/dev/stderr";
|
||||
exit(1);
|
||||
}
|
||||
vqsize = $2;
|
||||
|
||||
getline < bins;
|
||||
if ($1 != "Codeword") {
|
||||
print "file " bins " is not a VQ file" > "/dev/stderr";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
while ((getline < bins) > 0) {
|
||||
vqbin[$1] = $2;
|
||||
}
|
||||
close(bins);
|
||||
|
||||
$0 = saveline;
|
||||
}
|
||||
|
||||
NF==0 {
|
||||
print; next;
|
||||
}
|
||||
/^ngram *[0-9][0-9]*=/ {
|
||||
order = substr($2,1,index($2,"=")-1);
|
||||
print; next;
|
||||
}
|
||||
/^\\[0-9]-grams:/ {
|
||||
currorder=substr($0,2,1);
|
||||
print; next;
|
||||
}
|
||||
/^\\/ {
|
||||
print; next;
|
||||
}
|
||||
|
||||
#
|
||||
# replace VQ index with value in ngram parameter lines
|
||||
#
|
||||
currorder {
|
||||
if (!($1 in vqbin)) {
|
||||
print "line: " NR ": VQ bin #" $1 "is undefined" > "/dev/stderr";
|
||||
exit(1);
|
||||
}
|
||||
$1 = vqbin[$1];
|
||||
|
||||
# backoff weight, if any
|
||||
if (NF == currorder + 2) {
|
||||
if (!($NF in vqbin)) {
|
||||
print "line: " NR ": VQ bin #" $NF "is undefined" > "/dev/stderr";
|
||||
exit(1);
|
||||
}
|
||||
$NF = vqbin[$NF];
|
||||
}
|
||||
|
||||
print; next;
|
||||
}
|
||||
|
||||
# pass through anything else
|
||||
{ print }
|
||||
79
language_model/srilm-1.7.3/utils/src/empty-sentence-lm
Executable file
@@ -0,0 +1,79 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# empty-sentence-lm --
|
||||
# modify language model to allow the empty sentence.
|
||||
# This adds a "<s> </s>" bigram to the model and scales the
|
||||
# probabilities of other bigrams starting with <s>.
|
||||
# Backoff weights are recomputed.
|
||||
#
|
||||
# usage: empty-sentence-lm -prob P -lm oldlm -write-lm newlm
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/empty-sentence-lm,v 1.5 2013/03/09 07:13:01 stolcke Exp $
|
||||
#
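# A hypothetical invocation (file names are placeholders):
#
#   empty-sentence-lm -prob 0.05 -lm old.lm.gz -write-lm new.lm.gz
#
# assigns probability 0.05 to the empty sentence and scales the other
# <s> bigrams by 0.95 before renormalizing.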
|
||||
|
||||
oldlm=-
|
||||
newlm=-
|
||||
prob=0.1
|
||||
vocab=/dev/null
|
||||
norm_option=-renorm
|
||||
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
-prob) prob="$2" ; shift ;;
|
||||
-lm) oldlm="$2" ; shift ;;
|
||||
-write-lm) newlm="$2" ; shift ;;
|
||||
-nonorm) norm_option= ; shift ;;
|
||||
*) options="$options $1" ;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
gzip -dcf $oldlm | ${GAWK-gawk} '
|
||||
function log10(x) {
|
||||
return log(x)/2.30258509299404568402;
|
||||
}
|
||||
|
||||
/^ngram 2=/ {
|
||||
num = substr($2, 3);
|
||||
print "ngram 2=" num + 1;
|
||||
next;
|
||||
}
|
||||
|
||||
#
|
||||
# add empty-sentence bigram
|
||||
#
|
||||
/^\\2-grams:/ {
|
||||
print;
|
||||
print log10(prob), "<s> </s>";
|
||||
in_ngrams = 2;
|
||||
next;
|
||||
}
|
||||
|
||||
#
|
||||
# ensure that <s> has backoff weight and
|
||||
# approximately adjust it (correct adjustment done by ngram -renorm)
|
||||
#
|
||||
in_ngrams == 1 && $2 == "<s>" {
|
||||
$3 += log10(1-prob);
|
||||
}
|
||||
|
||||
#
|
||||
# scale bigram probs starting with <s>
|
||||
#
|
||||
in_ngrams == 2 && $2 == "<s>" {
|
||||
$1 += log10(1-prob);
|
||||
}
|
||||
|
||||
/^\\1-grams:/ {
|
||||
in_ngrams = 1;
|
||||
}
|
||||
|
||||
/^\\3-grams:/ {
|
||||
in_ngrams = 3;
|
||||
}
|
||||
|
||||
{
|
||||
print;
|
||||
}' prob=$prob | \
|
||||
ngram -lm - $norm_option -write-lm "$newlm" $options
|
||||
|
||||
17
language_model/srilm-1.7.3/utils/src/extract-skip-probs.gawk
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# extract-skip-probs --
|
||||
# Extract the skip probabilities from a Skip-Ngram model
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/extract-skip-probs.gawk,v 1.1 1996/05/20 21:22:09 stolcke Exp $
|
||||
#
|
||||
NF == 0 {
|
||||
next;
|
||||
}
|
||||
/\\end\\/ {
|
||||
end_seen = 1;
|
||||
next;
|
||||
}
|
||||
end_seen {
|
||||
printf "%s %f\n", $1, $2;
|
||||
}
|
||||
44
language_model/srilm-1.7.3/utils/src/filter-event-counts.gawk
Executable file
@@ -0,0 +1,44 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# filter-event-counts --
|
||||
# Remove from a count file all ngrams that don't correspond to an "event"
|
||||
# for the LM, such that
|
||||
#
|
||||
# ngram -order N -lm LM -ppl TEXT
|
||||
# and
|
||||
# ngram-count -order N -text TEXT -write - | \
|
||||
# filter-event-counts order=N | \
|
||||
# ngram -order N -lm LM -counts -
|
||||
#
|
||||
# yield the same result.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/filter-event-counts.gawk,v 1.2 2009/09/25 00:06:50 stolcke Exp $
|
||||
#
|
||||
BEGIN {
|
||||
order = 3;
|
||||
escape = "";
|
||||
|
||||
sent_start = "<s>";
|
||||
}
|
||||
|
||||
# pass escaped lines through
|
||||
escape != "" && substr($0, 1, length(escape)) == escape {
|
||||
print;
|
||||
next;
|
||||
}
|
||||
|
||||
# Start-of-sentence ngrams are always included (except for <s> unigram)
|
||||
$1 == sent_start {
|
||||
if (NF == 2) {
|
||||
next;
|
||||
} else {
|
||||
print;
|
||||
next;
|
||||
}
|
||||
}
|
||||
|
||||
# ngrams of highest order
|
||||
NF == order + 1 {
|
||||
print;
|
||||
}
|
||||
|
||||
89
language_model/srilm-1.7.3/utils/src/find-reference-posteriors.gawk
Executable file
@@ -0,0 +1,89 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# find-reference-posteriors --
|
||||
# tabulate the sausage posteriors of reference words
|
||||
#
|
||||
# usage: find-reference-posteriors posteriors_files=NBEST_POSTERIORS SAUSAGE
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/find-reference-posteriors.gawk,v 1.4 2010/08/20 00:17:18 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
sentid = "UNKNOWN";
|
||||
|
||||
M_LN10 = 2.30258509299404568402; # from <math.h>
|
||||
logINF = -320;
|
||||
}
|
||||
|
||||
function log10(x) {
|
||||
return log(x) / M_LN10;
|
||||
}
|
||||
function exp10(x) {
|
||||
if (x < logINF) {
|
||||
return 0;
|
||||
} else {
|
||||
return exp(x * M_LN10);
|
||||
}
|
||||
}
|
||||
function addlogs(x,y) {
|
||||
if (x<y) {
|
||||
temp = x; x = y; y = temp;
|
||||
}
|
||||
return x + log10(1 + exp10(y - x));
|
||||
}
|
||||
|
||||
NR == 1 {
|
||||
if (posteriors_file) {
|
||||
hypno = 0;
|
||||
num_sources = 0;
|
||||
while ((("gzip -dcf " posteriors_file) | getline pline) > 0) {
|
||||
if (split(pline, a) == 3) {
|
||||
hyp_source[hypno] = a[1];
|
||||
if (a[1] > num_sources) {
|
||||
num_sources = a[1];
|
||||
}
|
||||
hyp_posterior[hypno] = a[3];
|
||||
hypno ++;
|
||||
}
|
||||
}
|
||||
print "read " hypno " posteriors from " num_sources " sources" \
|
||||
>> "/dev/stderr";
|
||||
}
|
||||
}
|
||||
|
||||
# input format:
|
||||
# align 1 hello 0.988212 below 0.00481234 low 0.00331215 ...
|
||||
# reference 1 hello
|
||||
# hyps 1 hello 0 1 2 3 4 5 6 7 8 9 10 11 16 17 18 19
|
||||
|
||||
$1 == "align" {
|
||||
position = $2;
|
||||
|
||||
delete word_posteriors;
|
||||
for (i = 3; i <= NF; i +=2 ) {
|
||||
word_posteriors[$i] = $(i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
$1 == "reference" && $2 == position {
|
||||
refword = $3;
|
||||
}
|
||||
|
||||
$1 == "hyps" && $2 == position && $3 == refword {
|
||||
for (i = 1; i <= num_sources; i ++) {
|
||||
posterior_sum[i] = logINF;
|
||||
}
|
||||
for (i = 4; i <= NF; i ++) {
|
||||
posterior_sum[hyp_source[$i]] = \
|
||||
addlogs(posterior_sum[hyp_source[$i]], hyp_posterior[$i]);
|
||||
}
|
||||
|
||||
printf "%s %d %s %g", sentid, position, refword, \
|
||||
word_posteriors[refword];
|
||||
|
||||
for (i = 1; i <= num_sources; i ++) {
|
||||
printf " %g", exp10(posterior_sum[i]);
|
||||
}
|
||||
printf "\n";
|
||||
}
|
||||
|
||||
153
language_model/srilm-1.7.3/utils/src/fix-ctm.gawk
Executable file
@@ -0,0 +1,153 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# Post-process CTM files output by lattice-tool -output-ctm to
|
||||
# use global conversation-relative time marks and channel ids.
|
||||
# (This requires that the waveform names conform to our standard
|
||||
# formats, the same as in sentid-to-ctm.)
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/fix-ctm.gawk,v 1.10 2019/02/09 07:30:11 stolcke Exp $
|
||||
#
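# For example, a hypothetical sentid "sw2001_A_0012345_0012999" is parsed
# into conversation "sw2001", channel "A", and a start offset of 123.45
# seconds that is added to the utterance-relative word times.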
|
||||
BEGIN {
|
||||
# time to add to word start times (should be about half FE window size)
|
||||
phase_shift = 0.01;
|
||||
|
||||
tag_pat = "^<.*>$";
|
||||
htk_tag_pat = "^null|^!sent_start|^!sent_end";
|
||||
noise_pat = "^\\[.*\\]$";
|
||||
fragment_pat = "-$";
|
||||
pause = "-pau-";
|
||||
|
||||
channel_letters = 0;
|
||||
|
||||
# hesitations (best deleted for NIST scoring;
|
||||
# should be kept in sync with GLM filter file)
|
||||
hesitation["uh"] = 1;
|
||||
hesitation["um"] = 1;
|
||||
hesitation["eh"] = 1;
|
||||
hesitation["mm"] = 1;
|
||||
hesitation["hm"] = 1;
|
||||
hesitation["ah"] = 1;
|
||||
hesitation["huh"] = 1;
|
||||
hesitation["ha"] = 1;
|
||||
hesitation["er"] = 1;
|
||||
hesitation["oof"] = 1;
|
||||
hesitation["hee"] = 1;
|
||||
hesitation["ach"] = 1;
|
||||
hesitation["eee"] = 1;
|
||||
hesitation["ew"] = 1;
|
||||
|
||||
parse_sentids = 1;
|
||||
|
||||
orig_times = 0; # DON'T preserve original times
|
||||
|
||||
sort_cmd = "sort -b -k 1,1 -k 2,2 -k 3,3n";
|
||||
}
|
||||
{
|
||||
sentid = $1;
|
||||
start_time = $3;
|
||||
duration = $4;
|
||||
word = $5;
|
||||
confidence = $6;
|
||||
|
||||
# HTK stuff: strip quotes
|
||||
sub("\"", "", sentid);
|
||||
sub("\"", "", sentid);
|
||||
# archive aliasing info
|
||||
sub("=.*\\[.*\\]$", "", sentid);
|
||||
# standard input file suffixes.
|
||||
sub("\\.plp$", "", sentid);
|
||||
sub("\\.wav$", "", sentid);
|
||||
sub("\\.sph$", "", sentid);
|
||||
|
||||
if (sentid == last_sentid && start_time == "?") {
|
||||
start_time = last_end_time;
|
||||
duration = 0;
|
||||
}
|
||||
|
||||
# exclude sentence start/end tags
|
||||
if (word ~ tag_pat) next;
|
||||
if (tolower(word) ~ htk_tag_pat) next;
|
||||
|
||||
if (sentid == last_sentid) {
|
||||
if (start_time <= last_start_time) {
|
||||
new_start_time = last_start_time + .01;
|
||||
|
||||
print "warning: " sentid ": word \"" word "\" start time " start_time " " \
|
||||
(start_time < last_start_time ? "is less than" : "equals") \
|
||||
" previous word -- adjusting to " new_start_time > "/dev/stderr";
|
||||
|
||||
start_time = new_start_time;
|
||||
}
|
||||
}
|
||||
|
||||
if (!parse_sentids) {
|
||||
conv = sentid;
|
||||
channel = $2;
|
||||
start_offset = 0;
|
||||
} else if (match(sentid, "_[0-9]_[-0-9][0-9]*_[0-9][0-9]*$")) {
|
||||
# waveforms with [012] channel id, timemarks 1/1000s
|
||||
# NOTE: this form is used by the segmenter
|
||||
conv = substr(sentid, 1, RSTART-1);
|
||||
split(substr(sentid, RSTART+1), sentid_parts, "_");
|
||||
channel = sentid_parts[1];
|
||||
start_offset = sentid_parts[2] / 1000;
|
||||
end_offset = sentid_parts[3] / 1000;
|
||||
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*$")) {
|
||||
conv = substr(sentid, 1, RSTART-1);
|
||||
split(substr(sentid, RSTART+1), sentid_parts, "_");
|
||||
channel = sentid_parts[1];
|
||||
start_offset = sentid_parts[2] / 100;
|
||||
end_offset = sentid_parts[3] / 100;
|
||||
} else {
|
||||
print "cannot parse sentid " sentid >> "/dev/stderr";
|
||||
conv = sentid;
|
||||
channel = 1;
|
||||
start_offset = 0;
|
||||
end_offset = 10000;
|
||||
}
|
||||
|
||||
if (orig_times) {
|
||||
start_offset = 0;
|
||||
}
|
||||
|
||||
if (channel_letters && channel ~ /^[0-9]/) {
|
||||
channel = sprintf("%c", 64+channel);
|
||||
}
|
||||
|
||||
speaker_id = conv "_" channel;
|
||||
|
||||
ncomps = split(word, word_comps, "_");
|
||||
|
||||
for (j = 1; j <= ncomps; j ++) {
|
||||
this_word = word_comps[j];
|
||||
|
||||
if (this_word == pause) {
|
||||
next;
|
||||
} else if (this_word in hesitation) {
|
||||
word_type = "fp";
|
||||
} else if (this_word ~ fragment_pat) {
|
||||
word_type = "frag";
|
||||
} else if (this_word ~ noise_pat) {
|
||||
word_type = "non-lex";
|
||||
} else {
|
||||
word_type = "lex";
|
||||
}
|
||||
|
||||
printf "%s %s %.2f %.2f %s %g %s %s\n", \
|
||||
conv, channel, \
|
||||
start_offset + start_time + phase_shift + \
|
||||
(j - 1) * duration/ncomps,\
|
||||
duration/ncomps, \
|
||||
this_word, \
|
||||
confidence, \
|
||||
word_type, \
|
||||
(word_type == "non-lex" ? \
|
||||
"null" : speaker_id) \
|
||||
| sort_cmd;
|
||||
}
|
||||
|
||||
last_start_time = start_time;
|
||||
last_end_time = start_time + duration;
|
||||
last_sentid = sentid;
|
||||
}
|
||||
|
||||
158
language_model/srilm-1.7.3/utils/src/fsm-to-pfsg.gawk
Executable file
@@ -0,0 +1,158 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# fsm-to-pfsg --
|
||||
# convert AT&T FSM acceptor to Decipher PFSG format
|
||||
#
|
||||
# usage: fsm-to-pfsg [pfsg_name=NAME] [transducer=1] [scale=S] file.fsm > file.pfsg
|
||||
# pfsg_name=NAME sets PFSG name to NAME
|
||||
# transducer=1 indicates input is a transducer
|
||||
# scale=S sets transition weight scaling factor to S
|
||||
# (default -1)
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/fsm-to-pfsg.gawk,v 1.10 2015-07-03 03:45:38 stolcke Exp $
|
||||
#
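# A hypothetical invocation (file names are placeholders):
#
#   fsm-to-pfsg pfsg_name=mygrammar grammar.fsm > grammar.pfsg
#
# Word labels are moved from FSM arcs onto PFSG nodes by splitting each
# FSM state into one node per distinct incoming output symbol.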
|
||||
BEGIN {
|
||||
pfsg_name = "from_fsm";
|
||||
transducer = 0; # input is transducer
|
||||
|
||||
if ("TMPDIR" in ENVIRON) {
|
||||
tmpdir = ENVIRON["TMPDIR"];
|
||||
} else {
|
||||
tmpdir = "/tmp"
|
||||
}
|
||||
|
||||
if ("pid" in PROCINFO) {
|
||||
pid = PROCINFO["pid"];
|
||||
} else {
|
||||
getline pid < "/dev/pid";
|
||||
}
|
||||
tmpfile = tmpdir "/fsm.tmp" pid;
|
||||
|
||||
# hack to remove tmpfile when killed
|
||||
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
|
||||
print "" | trap_cmd;
|
||||
|
||||
num_newnodes = 0;
|
||||
initial_node = -1;
|
||||
empty_output = "NULL";
|
||||
epsilon = "<eps>"; # FSM epsilon symbol
|
||||
map_epsilon = ""; # map epsilon to this symbol
|
||||
scale = -1; # scaling of transition weights
|
||||
}
|
||||
|
||||
# transition description
|
||||
NF >= 3 {
|
||||
from_node = $1;
|
||||
to_node = $2;
|
||||
|
||||
if (map_epsilon && $3 == epsilon) $3 = map_epsilon;
|
||||
|
||||
if (transducer) {
|
||||
if (map_epsilon && $4 == epsilon) $4 = map_epsilon;
|
||||
|
||||
# collapse input and output into a single symbol
|
||||
$3 = $3 ":" $4;
|
||||
$4 = "";
|
||||
}
|
||||
|
||||
output = $3;
|
||||
|
||||
if (initial_node < 0) {
|
||||
initial_node = from_node;
|
||||
}
|
||||
|
||||
|
||||
# create new node names for pairs of output,old-node
|
||||
if (!(output " " to_node in newnode_table)) {
|
||||
output_table[num_newnodes] = output;
|
||||
newnode_table[output " " to_node] = num_newnodes ++;
|
||||
|
||||
# create list of incoming outputs for each state
|
||||
insymbols[to_node] = insymbols[to_node] " " output;
|
||||
}
|
||||
|
||||
# save for re-reading
|
||||
print $0 > tmpfile;
|
||||
next;
|
||||
}
|
||||
|
||||
# final state description
|
||||
NF >= 1 {
|
||||
node = $1;
|
||||
|
||||
if (initial_node < 0) {
|
||||
initial_node = node;
|
||||
}
|
||||
|
||||
# save for re-reading
|
||||
print $0 > tmpfile;
|
||||
next;
|
||||
}
|
||||
|
||||
END {
|
||||
close(tmpfile);
|
||||
|
||||
# create initial and final nodes
|
||||
if (!(empty_output " " initial_node in newnode_table)) {
|
||||
output_table[num_newnodes] = empty_output;
|
||||
newnode_table[empty_output " " initial_node] = num_newnodes ++;
|
||||
insymbols[initial_node] = insymbols[initial_node] " " empty_output;
|
||||
}
|
||||
|
||||
initial_newnode = newnode_table[empty_output " " initial_node];
|
||||
output_table[num_newnodes] = empty_output;
|
||||
final_newnode = num_newnodes++;
|
||||
|
||||
# print PFSG header info
|
||||
print "name " pfsg_name;
|
||||
printf "nodes %d", num_newnodes;
|
||||
for (i = 0; i < num_newnodes; i ++) {
|
||||
printf " %s", output_table[i];
|
||||
}
|
||||
printf "\n";
|
||||
printf "initial %d\n", initial_newnode;
|
||||
printf "final %d\n", final_newnode;
|
||||
|
||||
# re-read FSM description, counting total number of new
|
||||
# transitions
|
||||
num_transitions = 0;
|
||||
while (getline < tmpfile) {
|
||||
from_node = $1;
|
||||
|
||||
# duplicate transition for all insymbols of from_node
|
||||
num_transitions += split(insymbols[from_node], a);
|
||||
}
|
||||
close(tmpfile);
|
||||
printf "transitions %d\n", num_transitions;
|
||||
|
||||
# re-read FSM description, outputting new transitions
|
||||
while (getline < tmpfile) {
|
||||
if (NF >= 3) {
|
||||
from_node = $1;
|
||||
to_node = $2;
|
||||
output = $3;
|
||||
cost = (NF == 3 ? 0 : $4);
|
||||
|
||||
# duplicate transition for all insymbols of from_node
|
||||
n = split(insymbols[from_node], a);
|
||||
for (i = 1; i <= n; i ++) {
|
||||
printf "%d %d %d\n", \
|
||||
newnode_table[a[i] " " from_node], \
|
||||
newnode_table[output " " to_node], \
|
||||
scale * cost;
|
||||
}
|
||||
} else {
|
||||
from_node = $1;
|
||||
cost = (NF == 1 ? 0 : $2);
|
||||
|
||||
# add final transition for all insymbols of from_node
|
||||
n = split(insymbols[from_node], a);
|
||||
for (i = 1; i <= n; i ++) {
|
||||
printf "%d %d %d\n", \
|
||||
newnode_table[a[i] " " from_node], \
|
||||
final_newnode, \
|
||||
scale * cost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
39
language_model/srilm-1.7.3/utils/src/get-gt-counts.gawk
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# get-gt-counts --
|
||||
# generate the counts-of-counts required for Good-Turing discounting
|
||||
# assumes the ngrams in the input contain no repetitions
|
||||
#
|
||||
# usage: get-gt-counts max=<number> out=<name> file ...
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/get-gt-counts.gawk,v 1.5 2016-01-07 17:19:21 stolcke Exp $
|
||||
#
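# A hypothetical pipeline (file names are placeholders):
#
#   ngram-count -text corpus.txt -order 3 -write - | \
#       get-gt-counts max=7 out=corpus
#
# writes corpus.gt1counts ... corpus.gt3counts, each giving the
# counts-of-counts 0..7 plus a "total" line for that ngram order.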
|
||||
BEGIN {
|
||||
max = 10
|
||||
maxorder = 9;
|
||||
}
|
||||
{
|
||||
total[NF - 1] ++;
|
||||
}
|
||||
NF > 1 && $NF <= max {
|
||||
counts[(NF - 1), $NF] ++;
|
||||
}
|
||||
END {
|
||||
for (order = 1; order <= maxorder; order++) {
|
||||
if (total[order] > 0) {
|
||||
if (out) {
|
||||
outfile = out ".gt" order "counts";
|
||||
} else {
|
||||
outfile = "/dev/stdout";
|
||||
}
|
||||
|
||||
for (i = 0; i <= max; i ++) {
|
||||
c = counts[order, i];
|
||||
print i, c ? c : "0" > outfile;
|
||||
}
|
||||
print "total", total[order] > outfile;
|
||||
|
||||
if (out) close(outfile);
|
||||
}
|
||||
}
|
||||
}
|
||||
38
language_model/srilm-1.7.3/utils/src/get-unigram-probs.gawk
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# get-unigram-probs --
|
||||
# extract unigram probabilities from backoff LM file
|
||||
#
|
||||
# usage: get-unigram-probs bo-file
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/get-unigram-probs.gawk,v 1.3 2018/06/28 07:45:08 stolcke Exp $
|
||||
#
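# A hypothetical invocation (file names are placeholders):
#
#   get-unigram-probs linear=1 lm.arpa
#
# prints one "word probability" pair per line, converting the log10 values
# in the \1-grams: section to linear probabilities.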
|
||||
|
||||
BEGIN {
|
||||
linear = 0;
|
||||
|
||||
currorder = 0;
|
||||
logzero = -99;
|
||||
}
|
||||
|
||||
/^\\[0-9]-grams:/ {
|
||||
currorder = substr($0,2,1);
|
||||
next;
|
||||
}
|
||||
|
||||
/^\\/ {
|
||||
currorder = 0;
|
||||
next;
|
||||
}
|
||||
|
||||
currorder == 1 && NF > 0 {
|
||||
if (NF < 2) {
|
||||
print "line " NR ": missing word" > "/dev/stderr";
|
||||
} else if (linear) {
|
||||
print $2, $1 == logzero ? 0 : 10^$1;
|
||||
} else {
|
||||
print $2, $1 == logzero ? "-infinity" : $1;
|
||||
}
|
||||
next;
|
||||
}
|
||||
|
||||
79
language_model/srilm-1.7.3/utils/src/hits-from-log.gawk
Executable file
@@ -0,0 +1,79 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# hits-from-log --
|
||||
# Computes n-gram hit ratios from the output of
|
||||
#
|
||||
# ngram -debug 2 -ppl
|
||||
#
|
||||
# This is useful if one wants to analyse predictability of certain
|
||||
# words/contexts.
|
||||
#
|
||||
# Copyright (c) 1995, SRI International. All Rights Reserved
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/hits-from-log.gawk,v 1.3 1995/10/28 03:59:31 stolcke Exp $
|
||||
#
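# A hypothetical pipeline (file names are placeholders):
#
#   ngram -lm test.lm -ppl test.txt -debug 2 | hits-from-log
#
# reports what fraction of words was predicted by each ngram order.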
|
||||
BEGIN {
|
||||
M_LN10 = 2.30258509299404568402; # from <math.h>
|
||||
}
|
||||
/6gram/ {
|
||||
words ++;
|
||||
hits[6] ++;
|
||||
next;
|
||||
}
|
||||
/5gram/ {
|
||||
words ++;
|
||||
hits[5] ++;
|
||||
next;
|
||||
}
|
||||
/4gram/ {
|
||||
words ++;
|
||||
hits[4] ++;
|
||||
next;
|
||||
}
|
||||
/3gram/ {
|
||||
words ++;
|
||||
hits[3] ++;
|
||||
next;
|
||||
}
|
||||
/3\+Tgram/ {
|
||||
words ++;
|
||||
thits[3] ++;
|
||||
next;
|
||||
}
|
||||
/2gram/ {
|
||||
words ++;
|
||||
hits[2] ++;
|
||||
next;
|
||||
}
|
||||
/2\+Tgram/ {
|
||||
words ++;
|
||||
thits[2] ++;
|
||||
next;
|
||||
}
|
||||
/1gram/ {
|
||||
words ++;
|
||||
hits[1] ++;
|
||||
next;
|
||||
}
|
||||
/1\+Tgram/ {
|
||||
words ++;
|
||||
thits[1] ++;
|
||||
next;
|
||||
}
|
||||
{
|
||||
next;
|
||||
}
|
||||
END {
|
||||
printf "%d words, hit rates:\n", words;
|
||||
for (i = 1; i <= 6; i++) {
|
||||
if (hits[i]) {
|
||||
printf "%dgrams: %d (%.1f%%) ", i, hits[i], \
|
||||
(hits[i]/words * 100);
|
||||
}
|
||||
if (thits[i]) {
|
||||
printf "%d+Tgrams: %d (%.1f%%) ", i, thits[i], \
|
||||
(thits[i]/words * 100);
|
||||
}
|
||||
}
|
||||
printf "\n";
|
||||
}
|
||||
50
language_model/srilm-1.7.3/utils/src/htklat-vocab.gawk
Executable file
@@ -0,0 +1,50 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# htklat-vocab --
|
||||
# extract vocabulary used in an HTK lattice
|
||||
#
|
||||
# usage: htklat-vocab HTK-LATTICE ... > VOCAB
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/htklat-vocab.gawk,v 1.3 2004/02/27 21:42:28 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
null = "!NULL";
|
||||
quotes = 0;
|
||||
}
|
||||
|
||||
{
|
||||
for (i = 1; i <= NF; i ++) {
|
||||
# skip comments
|
||||
if ($i ~ /^#/) next;
|
||||
|
||||
# Note: this doesn't handle quoted spaces
|
||||
# (as SRILM generally doesn't)
|
||||
if ($i ~ /^W=/ || $i ~ /^WORD=/) {
|
||||
word = substr($i, index($i, "=") + 1);
|
||||
|
||||
if (quotes) {
|
||||
# HTK quoting conventions
|
||||
if (word ~ /^['"]/) {
|
||||
word = substr(word, 2, length(word)-2);
|
||||
}
|
||||
if (word ~ /\\/) {
|
||||
gsub(/\\\\/, "@QuOtE@", word);
|
||||
gsub(/\\/, "", word);
|
||||
gsub(/@QuOtE@/, "\\", word);
|
||||
}
|
||||
}
|
||||
|
||||
if (word != null) {
|
||||
is_word[word] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
for (word in is_word) {
|
||||
print word;
|
||||
}
|
||||
}
|
||||
|
||||
14
language_model/srilm-1.7.3/utils/src/isclassname.gawk
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# Test for classname heuristic used in add-pauses-to-pfsg.gawk
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/isclassname.gawk,v 1.1 2007/10/19 04:16:25 stolcke Exp $
|
||||
#
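# For example, "*digits*" and "CITYNAME" are treated as class names, while
# "Boston" is not (it contains lowercase letters).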
|
||||
|
||||
function is_classname(w) {
|
||||
return w ~ /^\*.*\*$/ || !(w ~ /[[:lower:]]/ || w ~ /[^\x00-\x7F]/);
|
||||
}
|
||||
|
||||
{
|
||||
print $1 " is " (!is_classname($1) ? "not " : "") "a class name";
|
||||
}
|
||||
31
language_model/srilm-1.7.3/utils/src/log10-to-bytelog.gawk
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# log10-to-bytelog --
|
||||
# convert log-base-10 scores to bytelog
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/log10-to-bytelog.gawk,v 1.1 1997/04/22 20:20:41 stolcke Exp $
|
||||
#
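# For example, a log10 score of -1 maps to rint(-1 * 2.302585 * 10000.5/1024),
# i.e. a bytelog value of -22.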
|
||||
BEGIN {
|
||||
logscale = 2.30258509299404568402 * 10000.5 / 1024.0;
|
||||
scale = 1;
|
||||
round = 0.5;
|
||||
}
|
||||
function rint(x) {
|
||||
if (x < 0) {
|
||||
return int(x - round);
|
||||
} else {
|
||||
return int(x + round);
|
||||
}
|
||||
}
|
||||
{
|
||||
for (i = 1; i <= NF; i ++) {
|
||||
if ($i ~ /^[-+.0-9][.0-9]*$/) {
|
||||
if (round) {
|
||||
$i = scale * rint($i * logscale);
|
||||
} else {
|
||||
$i = scale * $i * logscale;
|
||||
}
|
||||
}
|
||||
}
|
||||
print;
|
||||
}
|
||||
30
language_model/srilm-1.7.3/utils/src/make-abs-discount.gawk
Executable file
@@ -0,0 +1,30 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-abs-discount --
|
||||
# computes the absolute (constant) discount values from Good-Turing
|
||||
# counts-of-counts statistics. (Only the n1 and n2 statistics are used.)
|
||||
#
|
||||
# usage: make-abs-discount COUNTFILE
|
||||
#
|
||||
# where COUNTFILE was created with get-gt-counts.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-abs-discount.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
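# For example, if the counts-of-counts file reports n1=1000 and n2=400,
# the discount printed is 1000/(1000 + 2*400) ~= 0.556.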
|
||||
$1 == 1 {
|
||||
gt1count = $2;
|
||||
}
|
||||
$1 == 2 {
|
||||
gt2count = $2;
|
||||
}
|
||||
END {
|
||||
if (gt1count == 0) {
|
||||
print "n1 count is zero" >> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
if (gt2count == 0) {
|
||||
print "n2 count is zero" >> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
print gt1count/(gt1count + 2 * gt2count);
|
||||
}
|
||||
|
||||
112
language_model/srilm-1.7.3/utils/src/make-batch-counts
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# make-batch-counts --
|
||||
# generate n-gram counts in batches
|
||||
#
|
||||
# A list of data files is partitioned into batches, results from each of
|
||||
# which are deposited in a separate ngram-count file.
|
||||
#
|
||||
# usage: make-batch-counts file-list [batch-size [filter \
|
||||
# [countdir [options]]]]
|
||||
#
|
||||
# file-list is a file containing a list of data files
|
||||
# (lines starting with # are ignored)
|
||||
# batch-size is the number of input files per batch
|
||||
# filter is preprocessor filter to condition the data
|
||||
# countdir is the directory where count files are deposited
|
||||
# options are arguments passed on to ngram-count
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-batch-counts,v 1.8 2013/03/19 18:37:52 stolcke Exp $
|
||||
#
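# A hypothetical invocation (paths are placeholders):
#
#   make-batch-counts corpus.files 20 clean-text.sh counts -order 4
#
# counts 4-grams over batches of 20 input files, writing one
# counts/corpus-N.ngrams[.gz] file per batch plus a corpus.stats log.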
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "usage: $0 file-list [batch-size [filter [countdir [options]]]]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
filelist=$1
|
||||
batchsize=${2-10}
|
||||
filter=${3-/bin/cat}
|
||||
countdir=${4-./counts}
|
||||
|
||||
case $# in
|
||||
1) shift;;
|
||||
2) shift; shift;;
|
||||
3) shift; shift; shift;;
|
||||
4) shift; shift; shift; shift;;
|
||||
esac
|
||||
|
||||
options="$@"
|
||||
|
||||
what=`basename $filelist .files`
|
||||
statsfile=$countdir/$what.stats
|
||||
infiles=$countdir/$what.files
|
||||
|
||||
set -e
|
||||
|
||||
if [ ! -d $countdir ]; then
|
||||
mkdir $countdir
|
||||
fi
|
||||
|
||||
trap 'rm -f $newfile $test_in $test_out; exit 1' 1 2 15
|
||||
|
||||
# determine if ngram-count can generate compressed files
|
||||
test_in=$countdir/testin
|
||||
test_out=$countdir/testout.gz
|
||||
|
||||
echo x > $test_in
|
||||
ngram-count -text $test_in -write $test_out
|
||||
if gzip -l $test_out >/dev/null 2>&1; then
|
||||
gz=.gz
|
||||
else
|
||||
gz=
|
||||
fi
|
||||
rm $test_in $test_out
|
||||
|
||||
> $statsfile
|
||||
|
||||
#
|
||||
# format filelist into one batch per line, preceded by line number
|
||||
#
|
||||
${GAWK-gawk} -v batchsize=$batchsize \
|
||||
'BEGIN {
|
||||
batchno = 1;
|
||||
}
|
||||
/^#/ || NF == 0 {
|
||||
next;
|
||||
}
|
||||
{
|
||||
files = files " " $0;
|
||||
numfiles += 1;
|
||||
|
||||
if (numfiles >= batchsize) {
|
||||
print batchno, files;
|
||||
files = "";
|
||||
numfiles = 0;
|
||||
batchno += 1;
|
||||
}
|
||||
}
|
||||
END {
|
||||
if (numfiles > 0) {
|
||||
print batchno, files;
|
||||
}
|
||||
}' $filelist | \
|
||||
while read fileno datafiles; do
|
||||
newfile=$countdir/$what-$fileno.ngrams$gz
|
||||
|
||||
# avoid including $datafiles on command line to avoid length limit
|
||||
cat <<EOF >&2
|
||||
counting in $newfile sources $datafiles
|
||||
EOF
|
||||
|
||||
echo $datafiles | \
|
||||
xargs $filter | \
|
||||
ngram-count -text - \
|
||||
-tag $newfile \
|
||||
-sort \
|
||||
-write-order 0 \
|
||||
-write $newfile \
|
||||
$options \
|
||||
2>> $statsfile
|
||||
done
|
||||
|
||||
276
language_model/srilm-1.7.3/utils/src/make-big-lm
Executable file
@@ -0,0 +1,276 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# make-big-lm --
|
||||
# Create a large ngram language model
|
||||
#
|
||||
# This script automates various techniques for building large ngram models.
|
||||
# It is useful for building LMs that would exceed available real memory
|
||||
# if built in one pass by ngram-count.
|
||||
# The techniques employed are
|
||||
# - Assume counts are already produced
|
||||
# (typically using make-batch-counts/merge-batch-counts)
|
||||
# - Compute Good Turing discounts without loading all counts
|
||||
# into memory.
|
||||
# - ngram-counts loads only those counts exceeding cutoff values.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-big-lm,v 1.25 2015-05-27 08:10:52 stolcke Exp $
|
||||
#
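# A hypothetical invocation (paths are placeholders):
#
#   make-big-lm -read corpus.ngrams.gz -name biglm -order 4 \
#       -kndiscount -interpolate -lm biglm.4.gz
#
# estimates a 4-gram KN-smoothed LM from pre-merged counts without holding
# all counts in memory at once.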
|
||||
|
||||
name=biglm
|
||||
order=3
|
||||
gt1min=1
|
||||
gt2min=1
|
||||
gt3min=2
|
||||
gt4min=2
|
||||
gt5min=2
|
||||
gt6min=2
|
||||
gt7min=2
|
||||
gt8min=2
|
||||
gt9min=2
|
||||
gt1max=7
|
||||
gt2max=7
|
||||
gt3max=7
|
||||
gt4max=7
|
||||
gt5max=7
|
||||
gt6max=7
|
||||
gt7max=7
|
||||
gt8max=7
|
||||
gt9max=7
|
||||
kndiscount1=0
|
||||
kndiscount2=0
|
||||
kndiscount3=0
|
||||
kndiscount4=0
|
||||
kndiscount5=0
|
||||
kndiscount6=0
|
||||
kndiscount7=0
|
||||
kndiscount8=0
|
||||
kndiscount9=0
|
||||
ukndiscount1=0
|
||||
ukndiscount2=0
|
||||
ukndiscount3=0
|
||||
ukndiscount4=0
|
||||
ukndiscount5=0
|
||||
ukndiscount6=0
|
||||
ukndiscount7=0
|
||||
ukndiscount8=0
|
||||
ukndiscount9=0
|
||||
using_kn=
|
||||
max_per_file=10000000
|
||||
ngram_filter=cat
|
||||
subset_filter=cat
|
||||
counts=
|
||||
test_data=
|
||||
|
||||
trust_totals=0
|
||||
metatag=__meta__ # lowercase so it works with ngram-count -tolower
|
||||
|
||||
# avoid locale problems with gawk script computing discounting parameters
|
||||
LC_NUMERIC=C; export LC_NUMERIC
|
||||
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
-name) name=$2; shift ;;
|
||||
-order) order=$2 ; shift ;;
|
||||
-gt1min) gt1min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt2min) gt2min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt3min) gt3min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt4min) gt4min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt5min) gt5min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt6min) gt6min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt7min) gt7min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt8min) gt8min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt9min) gt9min=$2; options="$options $1 $2" ; shift ;;
|
||||
-gt1max) gt1max=$2; using_gt=1; shift ;;
|
||||
-gt2max) gt2max=$2; using_gt=1; shift ;;
|
||||
-gt3max) gt3max=$2; using_gt=1; shift ;;
|
||||
-gt4max) gt4max=$2; using_gt=1; shift ;;
|
||||
-gt5max) gt5max=$2; using_gt=1; shift ;;
|
||||
-gt6max) gt6max=$2; using_gt=1; shift ;;
|
||||
-gt7max) gt7max=$2; using_gt=1; shift ;;
|
||||
-gt8max) gt8max=$2; using_gt=1; shift ;;
|
||||
-gt9max) gt9max=$2; using_gt=1; shift ;;
|
||||
-kndiscount1) kndiscount1=1; using_kn=1 ;;
-kndiscount2) kndiscount2=1; using_kn=1 ;;
-kndiscount3) kndiscount3=1; using_kn=1 ;;
-kndiscount4) kndiscount4=1; using_kn=1 ;;
-kndiscount5) kndiscount5=1; using_kn=1 ;;
-kndiscount6) kndiscount6=1; using_kn=1 ;;
-kndiscount7) kndiscount7=1; using_kn=1 ;;
-kndiscount8) kndiscount8=1; using_kn=1 ;;
-kndiscount9) kndiscount9=1; using_kn=1 ;;
|
||||
-kndiscount) kndiscount1=1; kndiscount2=1; kndiscount3=1;
|
||||
kndiscount4=1; kndiscount5=1; kndiscount6=1;
|
||||
kndiscount7=1; kndiscount8=1; kndiscount9=1;
|
||||
using_kn=1 ;;
|
||||
-ukndiscount1) kndiscount1=1; ukndiscount1=1; using_kn=1 ;;
-ukndiscount2) kndiscount2=1; ukndiscount2=1; using_kn=1 ;;
-ukndiscount3) kndiscount3=1; ukndiscount3=1; using_kn=1 ;;
-ukndiscount4) kndiscount4=1; ukndiscount4=1; using_kn=1 ;;
-ukndiscount5) kndiscount5=1; ukndiscount5=1; using_kn=1 ;;
-ukndiscount6) kndiscount6=1; ukndiscount6=1; using_kn=1 ;;
-ukndiscount7) kndiscount7=1; ukndiscount7=1; using_kn=1 ;;
-ukndiscount8) kndiscount8=1; ukndiscount8=1; using_kn=1 ;;
-ukndiscount9) kndiscount9=1; ukndiscount9=1; using_kn=1 ;;
|
||||
-ukndiscount) kndiscount1=1; kndiscount2=1; kndiscount3=1;
|
||||
kndiscount4=1; kndiscount5=1; kndiscount6=1;
|
||||
kndiscount7=1; kndiscount8=1; kndiscount9=1;
|
||||
ukndiscount1=1; ukndiscount2=1; ukndiscount3=1;
|
||||
ukndiscount4=1; ukndiscount5=1; ukndiscount6=1;
|
||||
ukndiscount7=1; ukndiscount8=1; ukndiscount9=1;
|
||||
using_kn=1 ;;
|
||||
-wbdiscount) using_wb=1 ;;
|
||||
-wbdiscount*|-cdiscount*|-ndiscount*|-addsmooth*)
|
||||
echo "$0: must use one of GT, KN, UKN, or WB discounting for all orders" >&2
|
||||
exit 2 ;;
|
||||
-read) if [ "$2" = "" -o "$2" = - -o "$2" = "/dev/stdin" ]; then
|
||||
echo "$0: cannot read from stdin" >&2
|
||||
exit 2
|
||||
fi
|
||||
counts="$counts $2" ; shift ;;
|
||||
-trust-totals) trust_totals=1 ;;
|
||||
-max-per-file) max_per_file=$2 ; shift ;;
|
||||
-ngram-filter) ngram_filter="$2" ; shift ;;
|
||||
-text) test_data="$2"; shift ;;
|
||||
*) options="$options $1" ;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
if [ -z "$counts" ]; then
|
||||
echo "No counts specified" >&2
|
||||
echo "usage: $0 -read COUNTS [-name PATH] [-text TESTSET] [-ngram-filter FILTER] [-max-per-file N] [ngram-count-options ...]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [ -n "$using_gt" -a -n "$using_kn" -o \
|
||||
-n "$using_gt" -a -n "$using_wb" -o \
|
||||
-n "$using_kn" -a -n "$using_wb" ]
|
||||
then
|
||||
echo "$0: cannot mix GT, KN, and WB discounting" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [ $trust_totals -eq 0 ]; then
|
||||
options="$options -meta-tag $metatag"
|
||||
else
|
||||
if [ "$using_kn" ]; then
|
||||
echo "$0: -trust-totals incompatible with KN discounting; ignoring it" >&2
|
||||
options="$options -meta-tag $metatag"
|
||||
else
|
||||
options="$options -trust-totals"
|
||||
fi
|
||||
fi
|
||||
|
||||
set -e
|
||||
|
||||
#
|
||||
# if KN smoothing is used, compute the modified lower-order counts
|
||||
#
|
||||
if [ "$using_kn" ]; then
|
||||
kncounts=$name.kncounts.gz
|
||||
if [ -f $kncounts ]; then
|
||||
echo "using existing $kncounts" >&2
|
||||
elif [ $order -eq 1 ]; then
|
||||
# create a dummy empty file
|
||||
gzip -f < /dev/null > $kncounts
|
||||
else
|
||||
mkdir -p $name.kndir
|
||||
gzip -dcf $counts | \
|
||||
eval "$ngram_filter" | \
|
||||
(set -x; make-kn-counts \
|
||||
no_max_order=1 max_per_file=$max_per_file \
|
||||
order=$order \
|
||||
kndiscount1=$kndiscount1 kndiscount2=$kndiscount2 \
|
||||
kndiscount3=$kndiscount3 kndiscount4=$kndiscount4 \
|
||||
kndiscount5=$kndiscount5 kndiscount6=$kndiscount6 \
|
||||
kndiscount7=$kndiscount7 kndiscount8=$kndiscount8 \
|
||||
kndiscount9=$kndiscount9 \
|
||||
output=$name.kndir/kncounts)
|
||||
(set -x; merge-batch-counts $name.kndir)
|
||||
|
||||
# this will fail if more than one count file is left in kndir,
|
||||
# i.e., if merging didn't finish successfully
|
||||
mv `find $name.kndir -name \*.ngrams.gz -print ` $kncounts
|
||||
fi
|
||||
|
||||
options="$options -kn-counts-modified"
|
||||
fi
|
||||
|
||||
#
|
||||
# compute counts-of-counts
|
||||
#
|
||||
if [ "$using_wb" ]; then
|
||||
:
|
||||
elif [ -f $name.gt${order}counts ]; then
|
||||
echo "using existing gtcounts" >&2
|
||||
else
|
||||
if [ "$using_kn" ]; then
|
||||
# concatenate KN modified counts with highest-order original counts
|
||||
# Note: even though $kncounts ends in .gz it might be a plain file
|
||||
# if platform doesn't support gzip pipes, so use gzip -df .
|
||||
gzip -dcf $kncounts | ${GAWK-gawk} 'NF < 1+'$order
|
||||
gzip -dcf $counts | eval "$ngram_filter" | ${GAWK-gawk} 'NF == 1+'$order
|
||||
else
|
||||
gzip -dcf $counts | eval "$ngram_filter"
|
||||
fi | (set -x; get-gt-counts out=$name max=20 maxorder=$order)
|
||||
fi
|
||||
|
||||
#
|
||||
# compute discount factors
|
||||
#
|
||||
if [ "$using_wb" ]; then
|
||||
# apply WB discount to all ngram orders
|
||||
gtflags=-wbdiscount
|
||||
else
|
||||
gtflags=
|
||||
fi
|
||||
for n in 1 2 3 4 5 6 7 8 9
|
||||
do
|
||||
if [ $n -le $order -a -f $name.gt${n}counts ]; then
|
||||
if (set +e; eval [ \"\$ukndiscount${n}\" -eq 1 ]); then
|
||||
gtflags="$gtflags -kn${n} $name.kn${n}"
|
||||
eval make-kn-discounts modified=0 \
|
||||
min=\$gt${n}min $name.gt${n}counts > $name.kn${n}
|
||||
elif (set +e; eval [ \"\$kndiscount${n}\" -eq 1 ]); then
|
||||
gtflags="$gtflags -kn${n} $name.kn${n}"
|
||||
eval make-kn-discounts \
|
||||
min=\$gt${n}min $name.gt${n}counts > $name.kn${n}
|
||||
else
|
||||
gtflags="$gtflags -gt${n} $name.gt${n}"
|
||||
eval make-gt-discounts \
|
||||
min=\$gt${n}min max=\$gt${n}max \
|
||||
$name.gt${n}counts > $name.gt${n}
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# if test data is specified compute context ngrams
|
||||
if [ -n "$test_data" -a $order -gt 1 ]; then
|
||||
order1=`expr $order - 1`
|
||||
(set -x; \
|
||||
ngram-count -order $order1 -text "$test_data" -sort -write $name.contexts)
|
||||
|
||||
# ... and filter the ngrams to contain only the required contexts
|
||||
subset_filter="subset-context-ngrams contexts=$name.contexts"
|
||||
fi
|
||||
|
||||
#
|
||||
# filter counts and build lm
|
||||
#
|
||||
if [ "$using_kn" ]; then
|
||||
# concatenate KN modified counts with highest-order original counts
|
||||
# Note: even though $kncounts ends in .gz it might be a plain file
|
||||
# if platform doesn't support gzip pipes, so use gzip -df .
|
||||
gzip -dcf $kncounts | ${GAWK-gawk} 'NF < 1+'$order
|
||||
gzip -dcf $counts | eval "$ngram_filter" | ${GAWK-gawk} 'NF == 1+'$order
|
||||
else
|
||||
gzip -dcf $counts | eval "$ngram_filter"
|
||||
fi | \
|
||||
eval "$subset_filter" | \
|
||||
(set -x; \
|
||||
ngram-count -read - -read-with-mincounts -order $order \
|
||||
$gtflags \
|
||||
$options)
|
||||
|
||||
rm -f $name.contexts
|
||||
|
||||
89
language_model/srilm-1.7.3/utils/src/make-diacritic-map.gawk
Executable file
@@ -0,0 +1,89 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-diacritic-map --
|
||||
# Generate a map from ascii to accented word forms
|
||||
# for use with disambig(1)
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-diacritic-map.gawk,v 1.3 1998/02/04 20:28:02 stolcke Exp $
|
||||
#
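# Sketch of the input/output contract (the actual accented characters are
# Latin-1 and not reproduced here): the input is one word per line; the
# output has one line per ASCII base form,
#	BASEFORM <tab> VARIANT1 VARIANT2 ...
# plus fixed identity entries for <s> and </s>.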
|
||||
/^#/ {
|
||||
next;
|
||||
}
|
||||
function asciify(word) {
|
||||
gsub("<22>", "A", word);
|
||||
gsub("<22>", "A", word);
|
||||
gsub("<22>", "A", word);
|
||||
gsub("<22>", "A", word);
|
||||
gsub("<22>", "A", word);
|
||||
gsub("<22>", "A", word);
|
||||
gsub("<22>", "AE", word);
|
||||
gsub("<22>", "C", word);
|
||||
gsub("<22>", "E", word);
|
||||
gsub("<22>", "E", word);
|
||||
gsub("<22>", "E", word);
|
||||
gsub("<22>", "E", word);
|
||||
gsub("<22>", "I", word);
|
||||
gsub("<22>", "I", word);
|
||||
gsub("<22>", "I", word);
|
||||
gsub("<22>", "I", word);
|
||||
gsub("<22>", "N", word);
|
||||
gsub("<22>", "O", word);
|
||||
gsub("<22>", "O", word);
|
||||
gsub("<22>", "O", word);
|
||||
gsub("<22>", "O", word);
|
||||
gsub("<22>", "O", word);
|
||||
gsub("<22>", "O", word);
|
||||
gsub("<22>", "U", word);
|
||||
gsub("<22>", "U", word);
|
||||
gsub("<22>", "U", word);
|
||||
gsub("<22>", "U", word);
|
||||
gsub("<22>", "Y", word);
|
||||
gsub("<22>", "ss", word);
|
||||
gsub("<22>", "a", word);
|
||||
gsub("<22>", "a", word);
|
||||
gsub("<22>", "a", word);
|
||||
gsub("<22>", "a", word);
|
||||
gsub("<22>", "a", word);
|
||||
gsub("<22>", "a", word);
|
||||
gsub("<22>", "a", word);
|
||||
gsub("<22>", "c", word);
|
||||
gsub("<22>", "e", word);
|
||||
gsub("<22>", "e", word);
|
||||
gsub("<22>", "e", word);
|
||||
gsub("<22>", "e", word);
|
||||
gsub("<22>", "i", word);
|
||||
gsub("<22>", "i", word);
|
||||
gsub("<22>", "i", word);
|
||||
gsub("<22>", "i", word);
|
||||
gsub("<22>", "n", word);
|
||||
gsub("<22>", "o", word);
|
||||
gsub("<22>", "o", word);
|
||||
gsub("<22>", "o", word);
|
||||
gsub("<22>", "o", word);
|
||||
gsub("<22>", "o", word);
|
||||
gsub("<22>", "u", word);
|
||||
gsub("<22>", "u", word);
|
||||
gsub("<22>", "u", word);
|
||||
gsub("<22>", "u", word);
|
||||
gsub("<22>", "y", word);
|
||||
return word;
|
||||
}
|
||||
{
|
||||
word = $1;
|
||||
asciiword = asciify(word);
|
||||
|
||||
if (asciiword in map) {
|
||||
map[asciiword] = map[asciiword] " " word;
|
||||
} else {
|
||||
map[asciiword] = word;
|
||||
}
|
||||
}
|
||||
END {
|
||||
print "<s>\t<s>"
|
||||
print "</s>\t</s>"
|
||||
fflush()
|
||||
|
||||
for (w in map) {
|
||||
print w "\t" map[w] | "sort";
|
||||
}
|
||||
}
|
||||
124
language_model/srilm-1.7.3/utils/src/make-google-ngrams.gawk
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-google-ngrams --
|
||||
# split ngram count file into an indexed directory structure
|
||||
# compatible with the Google ngrams distributed by LDC
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-google-ngrams.gawk,v 1.6 2010/08/20 00:17:18 stolcke Exp $
|
||||
#
|
||||
# usage: zcat counts.gz | make-google-ngrams [dir=DIR] [per_file=N] [gzip=0] [yahoo=1]
|
||||
#
|
||||
# INPUT DATA is assumed to be a sorted ngram count file
|
||||
#
|
||||
#
|
||||
# OUTPUT DATA FORMAT
|
||||
#
|
||||
# a) top-level directory
|
||||
# doc: documentation
|
||||
# data: data
|
||||
# (the top-level structure is required by LDC)
|
||||
# b) data directory
|
||||
# one sub-directory per n-gram order: 1gms, 2gms, 3gms, 4gms, 5gms
|
||||
# (separating the orders makes it easier for people to use smaller orders)
|
||||
# c) contents of sub-directory 1gms
|
||||
# - file 'vocab.gz' contains the vocabulary sorted by word in unix
|
||||
# sort-order. Each word is on its own line:
|
||||
# WORD <tab> COUNT
|
||||
# - file 'vocab_cs.gz' contains the same data as 'vocab.gz' but
|
||||
# sorted by count.
|
||||
# (need to be 8+3 file names)
|
||||
# d) contents of sub-directories 2gms, 3gms, 4gms, 5gms:
|
||||
# - files 'Ngm-KKKK.gz' where N is the order of the n-grams
|
||||
# and KKKK is the zero-padded number of the file. Each file contains
|
||||
# 10 million n-gram entries. N-grams are unix-sorted. Each
|
||||
# n-gram occupies one line:
|
||||
# WORD1 <space> WORD2 <space> ... WORDN <tab> COUNT
|
||||
# - file 'Ngm.idx' where N is the order of the n-grams, with one line for
|
||||
# each n-gram file:
|
||||
# FILENAME <tab> FIRST_NGRAM_IN_FILE
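#
# For example, a sorted 3-gram count file yields a layout like this
# (file names follow the sprintf pattern used below; the number of
# n-grams per file is controlled by per_file):
#	data/1gms/vocab.gz  data/1gms/vocab_cs.gz
#	data/2gms/2gm-0000.gz ...  data/2gms/2gm.idx
#	data/3gms/3gm-0000.gz ...  data/3gms/3gm.idx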
|
||||
|
||||
BEGIN {
|
||||
dir = "data";
|
||||
|
||||
per_file = 10000000;
|
||||
gzip = 1;
|
||||
}
|
||||
|
||||
NR == 1 {
|
||||
if (gzip) {
|
||||
gzip_cmd = "gzip";
|
||||
gzip_suff = ".gz";
|
||||
} else {
|
||||
gzip_cmd = "cat";
|
||||
gzip_suff = "";
|
||||
}
|
||||
}
|
||||
|
||||
# determine ngram length
|
||||
{
|
||||
if (yahoo) {
|
||||
order = NF - 5;
|
||||
if (order > 0) {
|
||||
$NF = $(NF-1) = $(NF-2) = $(NF-3) = "";
|
||||
}
|
||||
} else {
|
||||
order = NF - 1;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# unigrams
|
||||
#
|
||||
order == 1 {
|
||||
if (!have_dir[1]) {
|
||||
system("mkdir -p " dir "/1gms");
|
||||
have_dir[1] = 1;
|
||||
|
||||
output_file[1] = gzip_cmd " > " dir "/1gms/vocab" gzip_suff;
|
||||
}
|
||||
|
||||
print | output_file[1];
|
||||
next;
|
||||
}
|
||||
|
||||
order > 1 {
|
||||
if (output_ngram_count[order] == 0) {
|
||||
output_ngram_count[order] = 1;
|
||||
|
||||
system("mkdir -p " dir "/" order "gms");
|
||||
if (output_file[order]) close(output_file[order]);
|
||||
output_name = sprintf("%dgm-%04d%s", order, output_file_count[order] ++, gzip_suff);
|
||||
output_file[order] = gzip_cmd " > " dir "/" order "gms/" output_name;
|
||||
|
||||
ngram = $1;
|
||||
for (i = 2; i <= order; i ++) {
|
||||
ngram = ngram " " $i;
|
||||
}
|
||||
|
||||
print output_name "\t" ngram > (dir "/" order "gms/" order "gm.idx");
|
||||
}
|
||||
|
||||
print | output_file[order];
|
||||
|
||||
output_ngram_count[order] += 1;
|
||||
output_ngram_count[order] %= (per_file + 1);
|
||||
next;
|
||||
}
|
||||
|
||||
order < 1 {
|
||||
print FILENAME ": " FNR ": insufficient number of fields" > "/dev/stderr";
|
||||
print $0 > "/dev/stderr";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
#
|
||||
# sort unigrams by count
|
||||
#
|
||||
END {
|
||||
close(output_file[1]);
|
||||
|
||||
if (have_dir[1]) {
|
||||
system("gzip -dcf " dir "/1gms/vocab" gzip_suff " | sort -k 2,2rn | " gzip_cmd " > " dir "/1gms/vocab_cs" gzip_suff);
|
||||
}
|
||||
}
|
||||
|
||||
76
language_model/srilm-1.7.3/utils/src/make-gt-discounts.gawk
Executable file
@@ -0,0 +1,76 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-gt-discounts --
|
||||
# generate Good-Turing discounting parameters from a count-of-count
|
||||
# file
|
||||
#
|
||||
# The purpose of this script is to do the GT computation off-line,
|
||||
# without ngram-count having to read all counts into memory.
|
||||
# The output is compatible with the ngram-count -gt<n> options.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-gt-discounts.gawk,v 1.3 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
|
||||
# usage: make-gt-discounts min=<mincount> max=<maxcount> countfile
|
||||
#
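# For reference, the discount coefficient computed below for a count r
# (1 <= r <= maxcount) is
#	d_r = (r*/r - A) / (1 - A),  with  r* = (r+1) n_{r+1} / n_r,
#	A   = (maxcount+1) n_{maxcount+1} / n_1,
# where n_r is the count-of-counts read from the input file.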
|
||||
BEGIN {
|
||||
min=1;
|
||||
max=7;
|
||||
}
|
||||
/^#/ {
|
||||
# skip comments
|
||||
next;
|
||||
}
|
||||
{
|
||||
countOfCounts[$1] = $2;
|
||||
}
|
||||
END {
|
||||
# Code below is essentially identical to GoodTuring::estimate()
|
||||
# (Discount.cc).
|
||||
minCount = min;
|
||||
maxCount = max;
|
||||
|
||||
if (!countOfCounts[1]) {
|
||||
printf "warning: no singleton counts\n" >> "/dev/stderr";
|
||||
maxCount = 0;
|
||||
}
|
||||
|
||||
while (maxCount > 0 && countOfCounts[maxCount + 1] == 0) {
|
||||
printf "warning: count of count %d is zero -- lowering maxcount\n", \
|
||||
maxCount + 1 >> "/dev/stderr";
|
||||
maxCount --;
|
||||
}
|
||||
|
||||
if (maxCount <= 0) {
|
||||
printf "GT discounting disabled\n" >> "/dev/stderr";
|
||||
} else {
|
||||
commonTerm = (maxCount + 1) * \
|
||||
countOfCounts[maxCount + 1] / \
|
||||
countOfCounts[1];
|
||||
|
||||
for (i = 1; i <= maxCount; i++) {
|
||||
|
||||
if (countOfCounts[i] == 0) {
|
||||
printf "warning: count of count %d is zero\n", \
|
||||
i >> "/dev/stderr";
|
||||
coeff = 1.0;
|
||||
} else {
|
||||
coeff0 = (i + 1) * countOfCounts[i+1] / \
|
||||
(i * countOfCounts[i]);
|
||||
coeff = (coeff0 - commonTerm) / (1.0 - commonTerm);
|
||||
if (coeff <= 0 || coeff0 > 1.0) {
|
||||
printf "warning: discount coeff %d is out of range: %g\n", \
|
||||
i, coeff >> "/dev/stderr";
|
||||
coeff = 1.0;
|
||||
}
|
||||
}
|
||||
discountCoeffs[i] = coeff;
|
||||
}
|
||||
}
|
||||
|
||||
printf "mincount %d\n", minCount;
|
||||
printf "maxcount %d\n", maxCount;
|
||||
|
||||
for (i = 1; i <= maxCount; i++) {
|
||||
printf "discount %d %g\n", i, discountCoeffs[i];
|
||||
}
|
||||
}
|
||||
100
language_model/srilm-1.7.3/utils/src/make-hiddens-lm.gawk
Executable file
@@ -0,0 +1,100 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-hiddens-lm --
|
||||
# Create a hidden-sentence-boundary ngram LM from a standard one
|
||||
#
|
||||
# This script edits a ARPA backoff model file as follows:
|
||||
#
|
||||
# 1 - ngrams involving <s> and </s> are duplicated using the
|
||||
# hidden segment boundary token <#s>.
|
||||
# 2 - ngrams starting with <s> are eliminated.
|
||||
# 3 - the backoff weight of <s> is set to 1.
|
||||
# this together with the previous change sets all probabilities conditioned
|
||||
# on <s> to the respective marginal probabilities without <s>.
|
||||
# 4 - ngrams ending in </s> get probability 1.
|
||||
# this avoids an end-of-sentence penalty in rescoring.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-hiddens-lm.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
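# For example (schematically), the bigram entry
#	-0.5	the </s>
# is also emitted as
#	-0.5	the <#s>
# so that sentence boundaries can be treated as hidden events in rescoring.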
|
||||
BEGIN {
|
||||
sent_start = "<s>";
|
||||
sent_end = "</s>";
|
||||
hiddens = "<#s>";
|
||||
|
||||
remove_old_ngrams = 0;
|
||||
}
|
||||
NF==0 {
|
||||
print; next;
|
||||
}
|
||||
/^ngram *[0-9][0-9]*=/ {
|
||||
print;
|
||||
next;
|
||||
}
|
||||
/^.[0-9]-grams:/ {
|
||||
currorder=substr($0,2,1);
|
||||
}
|
||||
/^\\/ {
|
||||
print; next;
|
||||
}
|
||||
#
|
||||
currorder && currorder < highorder {
|
||||
if (NF < currorder + 2) {
|
||||
print $0 "\t0";
|
||||
} else {
|
||||
print;
|
||||
}
|
||||
next;
|
||||
}
|
||||
$0 ~ sent_start || $0 ~ sent_end {
|
||||
oldline = $0;
|
||||
|
||||
# modify sentence initial/final ngrams
|
||||
if ($2 == sent_end && currorder == 1) {
|
||||
sos_uniprob = $1;
|
||||
|
||||
if (no_s_end) {
|
||||
# set </s> prob to 1
|
||||
$1 = 0;
|
||||
}
|
||||
if (!remove_old_ngrams) {
|
||||
print;
|
||||
}
|
||||
next;
|
||||
} else if ($2 == sent_start && currorder == 1) {
|
||||
if (no_s_start) {
|
||||
# set <s> backoff weight to 1
|
||||
$3 = 0;
|
||||
}
|
||||
if (!remove_old_ngrams) {
|
||||
print;
|
||||
}
|
||||
|
||||
# use unigram prob from </s>
|
||||
if (sos_uniprob == "") {
|
||||
print "warning: could not find " sent_end " unigram" \
|
||||
>> "/dev/stderr";
|
||||
} else {
|
||||
oldline = sos_uniprob "\t" $2 "\t" $3;
|
||||
}
|
||||
} else if ($2 == sent_start) {
|
||||
# suppress other ngrams starting with <s>
|
||||
if (!no_s_start && !remove_old_ngrams) {
|
||||
print;
|
||||
}
|
||||
} else if ($(currorder + 1) == sent_end) {
|
||||
if (no_s_end) {
|
||||
# set </s> prob to 1
|
||||
$1 = 0;
|
||||
}
|
||||
if (!remove_old_ngrams) {
|
||||
print;
|
||||
}
|
||||
}
|
||||
|
||||
# replace <s> and </s> with <#s> and output result
|
||||
gsub(sent_start, hiddens, oldline);
|
||||
gsub(sent_end, hiddens, oldline);
|
||||
print oldline;
|
||||
next;
|
||||
}
|
||||
{ print }
|
||||
82
language_model/srilm-1.7.3/utils/src/make-kn-counts.gawk
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-kn-counts --
|
||||
# Modify N-gram counts for KN smoothing
|
||||
#
|
||||
# This duplicates the action of ModKneserNey::prepareCounts().
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-kn-counts.gawk,v 1.5 2007/06/16 04:51:18 stolcke Exp $
|
||||
#
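# Roughly, for every n-gram "w1 ... wk count" whose (k-1)-gram suffix is
# subject to KN discounting, the script emits "w2 ... wk 1"; ngram-count
# then sums these, so the lower-order counts become type counts (numbers
# of distinct left contexts), as required by Kneser-Ney smoothing.
# Counts not subject to KN discounting are kept as-is, and highest-order
# n-grams are dropped when no_max_order is set.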
|
||||
BEGIN {
|
||||
order = 3;
|
||||
no_max_order = 0;
|
||||
|
||||
sent_start = "<s>";
|
||||
|
||||
ngram_count = "ngram-count";
|
||||
|
||||
output = "-";
|
||||
max_per_file = 0;
|
||||
|
||||
file_no = 0;
|
||||
ngram_no = 0;
|
||||
|
||||
}
|
||||
|
||||
function set_output () {
|
||||
close(output_cmd);
|
||||
|
||||
ngram_cmd = ngram_count " -order " order " -read - -sort -write ";
|
||||
|
||||
if (max_per_file > 0) {
|
||||
output_cmd = ngram_cmd output "-" ++file_no ".ngrams.gz";
|
||||
} else {
|
||||
output_cmd = ngram_cmd output;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
NR == 1 {
|
||||
kndiscount[1] = kndiscount1;
|
||||
kndiscount[2] = kndiscount2;
|
||||
kndiscount[3] = kndiscount3;
|
||||
kndiscount[4] = kndiscount4;
|
||||
kndiscount[5] = kndiscount5;
|
||||
kndiscount[6] = kndiscount6;
|
||||
kndiscount[7] = kndiscount7;
|
||||
kndiscount[8] = kndiscount8;
|
||||
kndiscount[9] = kndiscount9;
|
||||
|
||||
if (output == "-") {
|
||||
max_per_file = 0;
|
||||
}
|
||||
set_output();
|
||||
}
|
||||
|
||||
# discard ngrams not used in LM building
|
||||
NF - 1 > order {
|
||||
next;
|
||||
}
|
||||
# keep ngrams not subject to KN discounting, or those starting with <s>
|
||||
# if desired, highest-order ngrams are discarded to save space
|
||||
NF - 1 == order || !kndiscount[NF - 1] || $1 == sent_start {
|
||||
if (!no_max_order || NF - 1 < order) {
|
||||
if (max_per_file > 0 && ++ngram_no % max_per_file == 0) {
|
||||
ngram_no = 0;
|
||||
set_output();
|
||||
}
|
||||
print | output_cmd;
|
||||
}
|
||||
}
|
||||
# modify lower-order ngrams subject to KN discounting
|
||||
NF - 2 < order && kndiscount[NF - 2] && $2 != sent_start {
|
||||
$1 = $NF = "";
|
||||
|
||||
if (max_per_file > 0 && ++ngram_no % max_per_file == 0) {
|
||||
ngram_no = 0;
|
||||
set_output();
|
||||
}
|
||||
|
||||
# we let ngram-count add up the new counts for us
|
||||
print $0, 1 | output_cmd;
|
||||
}
|
||||
119
language_model/srilm-1.7.3/utils/src/make-kn-discounts.gawk
Executable file
@@ -0,0 +1,119 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-kn-discounts --
|
||||
# generate modified Kneser-Ney discounting parameters from a
|
||||
# count-of-count file
|
||||
#
|
||||
# The purpose of this script is to do the KN computation off-line,
|
||||
# without ngram-count having to read all counts into memory.
|
||||
# The output is compatible with the ngram-count -kn<n> options.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-kn-discounts.gawk,v 1.7 2015-05-27 08:10:52 stolcke Exp $
|
||||
#
|
||||
# usage: make-kn-discounts modified=<0|1> min=<mincount> countfile
|
||||
#
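# For reference, with n_r denoting the counts-of-counts read below and
#	Y = n_1 / (n_1 + 2 n_2),
# the modified (Chen & Goodman) discounts computed in the END block are
#	D_1  = 1 - 2 Y n_2 / n_1
#	D_2  = 2 - 3 Y n_3 / n_2
#	D_3+ = 3 - 4 Y n_4 / n_3
# while the original Kneser-Ney variant uses D_1 = D_2 = D_3+ = Y.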
|
||||
BEGIN {
|
||||
min = 1;
|
||||
modified = 1;
|
||||
}
|
||||
|
||||
/^#/ {
|
||||
# skip comments
|
||||
next;
|
||||
}
|
||||
|
||||
{
|
||||
countOfCounts[$1] = $2;
|
||||
if ($1 != "total" && $1 > maxCount && $2 > 0) {
|
||||
maxCount = $1;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# Estimate missing counts-of-counts f(k) based on the empirical law
|
||||
#
|
||||
# log f(k) - log f(k+1) = a / k
|
||||
#
|
||||
# for some constant a dependent on the distribution.
|
||||
#
|
||||
function handle_missing_counts() {
|
||||
|
||||
#
|
||||
# compute average a value based on well-defined counts-of-counts
|
||||
#
|
||||
a_sum = 0;
|
||||
|
||||
for (k = maxCount - 1; k > 0; k --) {
|
||||
if (countOfCounts[k] == 0) break;
|
||||
|
||||
a = k * (log(countOfCounts[k]) - log(countOfCounts[k + 1]));
|
||||
|
||||
if (debug) {
|
||||
print "k = " k ", a = " a > "/dev/stderr";
|
||||
}
|
||||
|
||||
a_sum += a;
|
||||
}
|
||||
|
||||
if (maxCount - 1 == k) {
|
||||
# no data to estimate a, give up
|
||||
return;
|
||||
}
|
||||
|
||||
avg_a = a_sum / (maxCount - k - 1);
|
||||
|
||||
if (debug) {
|
||||
print "average a = " avg_a > "/dev/stderr";
|
||||
}
|
||||
|
||||
## print "avg_a", avg_a > "/dev/stderr";
|
||||
|
||||
for ( ; k > 0; k --) {
|
||||
if (countOfCounts[k] == 0) {
|
||||
countOfCounts[k] = exp(log(countOfCounts[k + 1]) + avg_a / k);
|
||||
|
||||
print "estimating missing count-of-count " k \
|
||||
" = " countOfCounts[k] > "/dev/stderr";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
# Code below is essentially identical to ModKneserNey::estimate()
|
||||
# (Discount.cc).
|
||||
|
||||
handle_missing_counts();
|
||||
|
||||
if (countOfCounts[1] == 0 || \
|
||||
countOfCounts[2] == 0 || \
|
||||
modified && countOfCounts[3] == 0 || \
|
||||
modified && countOfCounts[4] == 0) \
|
||||
{
|
||||
printf "error: one of required counts of counts is zero\n" \
|
||||
>> "/dev/stderr";
|
||||
exit(2);
|
||||
}
|
||||
|
||||
Y = countOfCounts[1]/(countOfCounts[1] + 2 * countOfCounts[2]);
|
||||
|
||||
if (modified) {
|
||||
discount1 = 1 - 2 * Y * countOfCounts[2] / countOfCounts[1];
|
||||
discount2 = 2 - 3 * Y * countOfCounts[3] / countOfCounts[2];
|
||||
discount3plus = 3 - 4 * Y * countOfCounts[4] / countOfCounts[3];
|
||||
} else {
|
||||
# original KN discounting
|
||||
discount1 = discount2 = discount3plus = Y;
|
||||
}
|
||||
|
||||
print "mincount", min;
|
||||
print "discount1", discount1;
|
||||
print "discount2", discount2;
|
||||
print "discount3+", discount3plus;
|
||||
|
||||
# check for invalid values after output, so we see where the problem is
|
||||
if (discount1 < 0 || discount2 < 0 || discount3plus < 0) {
|
||||
printf "error: one of modified KneserNey discounts is negative\n" \
|
||||
>> "/dev/stderr";
|
||||
exit(2);
|
||||
}
|
||||
}
|
||||
32
language_model/srilm-1.7.3/utils/src/make-lm-subset.gawk
Executable file
@@ -0,0 +1,32 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# filter a backoff model with a count file, so that only ngrams
|
||||
# in the countfile are represented in the output
|
||||
#
|
||||
# usage: make-lm-subset count-file bo-file
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-lm-subset.gawk,v 1.3 1999/10/17 06:10:10 stolcke Exp $
|
||||
#
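# Example (hypothetical file names): keep only the n-grams listed in
# heldout.counts:
#	make-lm-subset heldout.counts full.lm > subset.lm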
|
||||
ARGIND==1 {
|
||||
ngram = $0;
|
||||
sub("[ ]*[0-9]*$", "", ngram);
|
||||
count[ngram] = 1;
|
||||
next;
|
||||
}
|
||||
ARGIND==2 && /^$/ {
|
||||
print; next;
|
||||
}
|
||||
ARGIND==2 && /^\\/ {
|
||||
print; next;
|
||||
}
|
||||
ARGIND==2 && /^ngram / {
|
||||
print; next;
|
||||
}
|
||||
ARGIND==2 {
|
||||
ngram = $0;
|
||||
# strip numeric stuff
|
||||
sub("^[-.e0-9]*[ ]*", "", ngram);
|
||||
sub("[ ]*[-.e0-9]*$", "", ngram);
|
||||
if (count[ngram]) print;
|
||||
next;
|
||||
}
|
||||
73
language_model/srilm-1.7.3/utils/src/make-meta-counts.gawk
Executable file
@@ -0,0 +1,73 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-meta-counts --
|
||||
# Apply N-gram count cut-offs and insert meta-counts (counts-of-counts)
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-meta-counts.gawk,v 1.2 2002/07/22 21:24:45 stolcke Exp $
|
||||
#
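# For example, with mincount3=2 the below-cutoff trigram count
#	a b c	1
# is rewritten (and buffered) as the meta-count
#	a b __META__1	1
# i.e. the last word is replaced by the meta-tag for the original count.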
|
||||
BEGIN {
|
||||
order = 3;
|
||||
# trust_totals=1 means we don't have to generate meta-counts, just
|
||||
# apply the cut-offs (in combination with ngram-count -trust-totals)
|
||||
trust_totals = 0;
|
||||
metatag = "__META__";
|
||||
}
|
||||
|
||||
NR == 1 {
|
||||
mincount[1] = mincount1 + 0;
|
||||
mincount[2] = mincount2 + 0;
|
||||
mincount[3] = mincount3 + 0;
|
||||
mincount[4] = mincount4 + 0;
|
||||
mincount[5] = mincount5 + 0;
|
||||
mincount[6] = mincount6 + 0;
|
||||
mincount[7] = mincount7 + 0;
|
||||
mincount[8] = mincount8 + 0;
|
||||
mincount[9] = mincount9 + 0;
|
||||
}
|
||||
|
||||
NF > order + 1 {
|
||||
next;
|
||||
}
|
||||
|
||||
NF > 1 {
|
||||
this_order = NF - 1;
|
||||
|
||||
if (!trust_totals) {
|
||||
# output buffered ngrams of higher order IF there was at least
|
||||
# one non-meta count of the respective order
|
||||
for (i = order; i > this_order; i --) {
|
||||
if (have_counts[i]) {
|
||||
printf "%s", buffer[i];
|
||||
have_counts[i] = 0;
|
||||
}
|
||||
delete buffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
if ($NF < mincount[this_order]) {
|
||||
if (trust_totals) {
|
||||
next;
|
||||
} else {
|
||||
# convert below-cutoff ngram to meta-ngram
|
||||
$this_order = metatag int($NF);
|
||||
$NF = 1;
|
||||
|
||||
# add it to buffer
|
||||
buffer[this_order] = buffer[this_order] $0 "\n";
|
||||
}
|
||||
} else {
|
||||
have_counts[this_order] = 1;
|
||||
print;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
END {
|
||||
# output any remaining buffered ngrams
|
||||
for (i = order; i >= 1; i --) {
|
||||
if (have_counts[i]) {
|
||||
printf "%s", buffer[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
70
language_model/srilm-1.7.3/utils/src/make-multiword-pfsg
Executable file
@@ -0,0 +1,70 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# make-multiword-pfsg --
|
||||
# rewrite a PFSG in terms of multiwords
|
||||
#
|
||||
# usage: make-multiword-pfsg multiword-defs [pfsg] > new-pfsg
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-multiword-pfsg,v 1.5 2015-07-03 03:45:39 stolcke Exp $
|
||||
#
|
||||
|
||||
multiword_defs=${1}
|
||||
shift
|
||||
|
||||
tmpdir=${TMPDIR-/tmp}
|
||||
name="$tmpdir/name.$$"
|
||||
vocab="$tmpdir/vocab.$$"
|
||||
old_fsm="$tmpdir/infsm.$$.gz"
|
||||
class_fsm="$tmpdir/classfsm.$$"
|
||||
class_fsmc="$tmpdir/classfsmc.$$"
|
||||
mw_symbols="$tmpdir/mw_symbols.$$"
|
||||
word_symbols="$tmpdir/word_symbols.$$"
|
||||
|
||||
trap "rm -f $name $vocab $old_fsm $class_fsm $class_fsmc $mw_symbols $word_symbols; exit" 0 1 2 15
|
||||
|
||||
#
|
||||
# extract vocab and convert PFSG to FSM
|
||||
#
|
||||
${GAWK-gawk} -v name=$name -v vocab=$vocab '$1 == "name" && !have_name {
|
||||
have_name = 1;
|
||||
print $2 > name;
|
||||
}
|
||||
$1 == "nodes" {
|
||||
# collect vocabulary
|
||||
for (i = 3; i <= NF; i ++) {
|
||||
if ($i != "NULL") is_word[$i] = 1;
|
||||
}
|
||||
}
|
||||
{ print;
|
||||
}
|
||||
END {
|
||||
for (word in is_word) {
|
||||
print word > vocab
|
||||
}
|
||||
}' "$@" | \
|
||||
pfsg-to-fsm symbolic=1 | \
|
||||
gzip > $old_fsm
|
||||
|
||||
new_name=`cat $name`_multiwords
|
||||
|
||||
#
|
||||
# create multiword transducer
|
||||
# Note: this is the same as reversed class-transducer
|
||||
#
|
||||
classes-to-fsm vocab=$vocab symbolic=1 \
|
||||
isymbolfile=$mw_symbols \
|
||||
osymbolfile=$word_symbols \
|
||||
$multiword_defs > $class_fsm
|
||||
|
||||
fsmcompile -t -i $mw_symbols -o $word_symbols $class_fsm | \
|
||||
fsminvert > $class_fsmc
|
||||
|
||||
#
|
||||
# compose original FSM with multiword transducer;
|
||||
# then convert back to PFSG
|
||||
#
|
||||
{ gzip -dcf $old_fsm; rm -f $old_fsm; } | fsmcompile -i $word_symbols | \
|
||||
fsmcompose - $class_fsmc | fsmproject -o | \
|
||||
fsmprint -i $mw_symbols | fsm-to-pfsg pfsg_name=$new_name
|
||||
|
||||
|
||||
135
language_model/srilm-1.7.3/utils/src/make-nbest-pfsg.gawk
Executable file
@@ -0,0 +1,135 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-nbest-pfsg --
|
||||
# convert Decipher N-best list to PFSG lattice
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-nbest-pfsg.gawk,v 1.5 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
|
||||
BEGIN {
|
||||
initial = 0;
|
||||
final = 1;
|
||||
nodecount = 2;
|
||||
transcount = 0;
|
||||
|
||||
null = "NULL";
|
||||
|
||||
outputs[initial] = outputs[final] = null;
|
||||
|
||||
format = 0;
|
||||
name = "";
|
||||
|
||||
notree = 0; # do build prefix tree
|
||||
|
||||
scale = 0; # scaling factor for log posteriors
|
||||
amw = 1; # acoustic model weight
|
||||
lmw = 8; # language model weight
|
||||
wtw = 0; # word transition weight
|
||||
}
|
||||
|
||||
function start_hyp() {
|
||||
lastnode = initial;
|
||||
}
|
||||
|
||||
function add_word(word, weight) {
|
||||
nextnode = tree[lastnode " " word];
|
||||
if (nextnode && !notree) {
|
||||
if (weights[lastnode " " nextnode] != weight) {
|
||||
printf "inconsistent weight for transition %s -> %s\n",\
|
||||
lastnode, nextnode >> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
|
||||
lastnode = nextnode;
|
||||
} else {
|
||||
newnode = nodecount ++;
|
||||
outputs[newnode] = word;
|
||||
|
||||
tree[lastnode " " word] = newnode;
|
||||
weights[lastnode " " newnode] = weight;
|
||||
transcount ++;
|
||||
|
||||
lastnode = newnode;
|
||||
}
|
||||
}
|
||||
|
||||
function end_hyp(weight) {
|
||||
nextnode = tree[lastnode " " null];
|
||||
if (nextnode && !notree) {
|
||||
if (weights[lastnode " " nextnode] != weight) {
|
||||
printf "inconsistent final weight for %s\n",\
|
||||
lastnode >> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
} else {
|
||||
tree[lastnode " " null] = final;
|
||||
weights[lastnode " " final] = weight;
|
||||
transcount ++;
|
||||
}
|
||||
}
|
||||
|
||||
function print_pfsg(name) {
|
||||
|
||||
printf "name %s\n", name;
|
||||
printf "nodes %d", nodecount;
|
||||
for (node = 0; node < nodecount; node ++) {
|
||||
printf " %s", outputs[node];
|
||||
}
|
||||
printf "\n";
|
||||
|
||||
printf "initial %d\n", initial;
|
||||
printf "final %d\n", final;
|
||||
|
||||
printf "transitions %d\n", transcount;
|
||||
|
||||
for (trans in weights) {
|
||||
split(trans, a);
|
||||
fromnode = a[1];
|
||||
tonode = a[2];
|
||||
|
||||
printf "%d %d %g\n", fromnode, tonode, \
|
||||
weights[fromnode " " tonode];
|
||||
}
|
||||
printf "\n";
|
||||
}
|
||||
|
||||
/^NBestList1\.0/ {
|
||||
format = 1;
|
||||
next;
|
||||
}
|
||||
/^NBestList2\.0/ {
|
||||
format = 2;
|
||||
next;
|
||||
}
|
||||
format == 0 {
|
||||
totalscore = scale * (amw * $1 + lmw * $2 + wtw * $3);
|
||||
start_hyp();
|
||||
for (i = 4; i <= NF; i ++) {
|
||||
add_word($i, 0);
|
||||
}
|
||||
end_hyp(totalscore);
|
||||
next;
|
||||
}
|
||||
format == 1 {
|
||||
totalscore = scale * substr($1, 2, length($1)-2);
|
||||
start_hyp();
|
||||
for (i = 2; i <= NF; i ++) {
|
||||
add_word($i, 0);
|
||||
}
|
||||
end_hyp(totalscore);
|
||||
next;
|
||||
}
|
||||
format == 2 {
|
||||
start_hyp();
|
||||
for (i = 2; i <= NF; i += 11) {
|
||||
add_word($i, scale * ($(i + 7) + $(i + 9)));
|
||||
}
|
||||
end_hyp(0);
|
||||
next;
|
||||
}
|
||||
END {
|
||||
if (!name) {
|
||||
name = FILENAME;
|
||||
}
|
||||
print_pfsg(name);
|
||||
}
|
||||
|
||||
351
language_model/srilm-1.7.3/utils/src/make-ngram-pfsg.gawk
Executable file
@@ -0,0 +1,351 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-ngram-pfsg --
|
||||
# Create a Decipher PFSG from an N-gram language model
|
||||
#
|
||||
# usage: make-ngram-pfsg [debug=1] [check_bows=1] [maxorder=N] [no_empty_bo=1] backoff-lm > pfsg
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-ngram-pfsg.gawk,v 1.32 2015-07-03 03:45:38 stolcke Exp $
|
||||
#
|
||||
|
||||
#########################################
|
||||
#
|
||||
# Output format specific code
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
logscale = 2.30258509299404568402 * 10000.5;
|
||||
round = 0.5;
|
||||
start_tag = "<s>";
|
||||
end_tag = "</s>";
|
||||
null = "NULL";
|
||||
version = 0;
|
||||
top_level_name = "";
|
||||
no_empty_bo = 0;
|
||||
|
||||
if ("TMPDIR" in ENVIRON) {
|
||||
tmpdir = ENVIRON["TMPDIR"];
|
||||
} else {
|
||||
tmpdir = "/tmp"
|
||||
}
|
||||
|
||||
if ("pid" in PROCINFO) {
|
||||
pid = PROCINFO["pid"];
|
||||
} else {
|
||||
getline pid < "/dev/pid";
|
||||
}
|
||||
tmpfile = tmpdir "/pfsg." pid;
|
||||
|
||||
# hack to remove tmpfile when killed
|
||||
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
|
||||
print "" | trap_cmd;
|
||||
|
||||
debug = 0;
|
||||
|
||||
write_contexts = "";
|
||||
read_contexts = "";
|
||||
}
|
||||
|
||||
function rint(x) {
|
||||
if (x < 0) {
|
||||
return int(x - round);
|
||||
} else {
|
||||
return int(x + round);
|
||||
}
|
||||
}
|
||||
|
||||
function scale_log(x) {
|
||||
return rint(x * logscale);
|
||||
}
|
||||
|
||||
function output_for_node(name) {
|
||||
num_words = split(name, words);
|
||||
|
||||
if (num_words == 0) {
|
||||
print "output_for_node: got empty name" >> "/dev/stderr";
|
||||
exit(1);
|
||||
} else if (words[1] == bo_name) {
|
||||
return null;
|
||||
} else if (words[num_words] == end_tag || \
|
||||
words[num_words] == start_tag)
|
||||
{
|
||||
return null;
|
||||
} else {
|
||||
return words[num_words];
|
||||
}
|
||||
}
|
||||
|
||||
function node_exists(name) {
|
||||
return (name in node_num);
|
||||
}
|
||||
|
||||
function node_index(name) {
|
||||
i = node_num[name];
|
||||
if (i == "") {
|
||||
i = num_nodes ++;
|
||||
node_num[name] = i;
|
||||
node_string[i] = output_for_node(name);
|
||||
|
||||
if (debug) {
|
||||
print "node " i " = " name ", output = " node_string[i] \
|
||||
>> "/dev/stderr";
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
function start_grammar(name) {
|
||||
num_trans = 0;
|
||||
num_nodes = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
function end_grammar(name) {
|
||||
if (!node_exists(start_tag)) {
|
||||
print start_tag " tag undefined in LM" >> "/dev/stderr";
|
||||
exit(1);
|
||||
} else if (!node_exists(end_tag)) {
|
||||
print end_tag " tag undefined in LM" >> "/dev/stderr";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
printf "%d pfsg nodes\n", num_nodes >> "/dev/stderr";
|
||||
printf "%d pfsg transitions\n", num_trans >> "/dev/stderr";
|
||||
|
||||
# output version id if supplied
|
||||
if (version) {
|
||||
print "version " version "\n";
|
||||
}
|
||||
|
||||
# use optional top-level grammar name if given
|
||||
print "name " (top_level_name ? top_level_name : name);
|
||||
printf "nodes %s", num_nodes;
|
||||
for (i = 0; i < num_nodes; i ++) {
|
||||
printf " %s", node_string[i];
|
||||
}
|
||||
printf "\n";
|
||||
|
||||
print "initial " node_index(start_tag);
|
||||
print "final " node_index(end_tag);
|
||||
print "transitions " num_trans;
|
||||
fflush();
|
||||
|
||||
if (close(tmpfile) < 0) {
|
||||
print "error closing tmp file" >> "/dev/stderr";
|
||||
exit(1);
|
||||
}
|
||||
system("/bin/cat " tmpfile);
|
||||
}
|
||||
|
||||
function add_trans(from, to, prob) {
|
||||
if (debug) {
|
||||
print "add_trans " from " -> " to " " prob >> "/dev/stderr";
|
||||
}
|
||||
num_trans ++;
|
||||
print node_index(from), node_index(to), scale_log(prob) > tmpfile;
|
||||
}
|
||||
|
||||
#########################################
|
||||
#
|
||||
# Generic code for parsing backoff file
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
maxorder = 0;
|
||||
grammar_name = "PFSG";
|
||||
bo_name = "__BACKOFF__";
|
||||
start_bo_name = bo_name " __FROM_START__";
|
||||
check_bows = 0;
|
||||
epsilon = 1e-5; # tolerance for lowprob detection
|
||||
}
|
||||
|
||||
NR == 1 {
|
||||
start_grammar(grammar_name);
|
||||
|
||||
if (read_contexts) {
|
||||
while ((getline context < read_contexts) > 0) {
|
||||
is_context[context] = 1;
|
||||
}
|
||||
close(read_contexts);
|
||||
}
|
||||
}
|
||||
|
||||
NF == 0 {
|
||||
next;
|
||||
}
|
||||
|
||||
/^ngram *[0-9][0-9]*=/ {
|
||||
num_grams = substr($2,index($2,"=")+1);
|
||||
if (num_grams > 0) {
|
||||
order = substr($2,1,index($2,"=")-1);
|
||||
|
||||
# limit maximal N-gram order if desired
|
||||
if (maxorder > 0 && order > maxorder) {
|
||||
order = maxorder;
|
||||
}
|
||||
|
||||
if (order == 1) {
|
||||
grammar_name = "UNIGRAM_PFSG";
|
||||
} else if (order == 2) {
|
||||
grammar_name = "BIGRAM_PFSG";
|
||||
} else if (order == 3) {
|
||||
grammar_name = "TRIGRAM_PFSG";
|
||||
} else {
|
||||
grammar_name = "NGRAM_PFSG";
|
||||
}
|
||||
}
|
||||
next;
|
||||
}
|
||||
|
||||
/^\\[0-9]-grams:/ {
|
||||
currorder = substr($0,2,1);
|
||||
next;
|
||||
}
|
||||
/^\\/ {
|
||||
next;
|
||||
}
|
||||
|
||||
#
|
||||
# unigram parsing
|
||||
#
|
||||
currorder == 1 {
|
||||
first_word = last_word = ngram = $2;
|
||||
ngram_prefix = ngram_suffix = "";
|
||||
|
||||
# we need all unigram backoffs (except for </s>),
|
||||
# so fill in missing bow where needed
|
||||
if (NF == 2 && last_word != end_tag) {
|
||||
$3 = 0;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# bigram parsing
|
||||
#
|
||||
currorder == 2 {
|
||||
ngram_prefix = first_word = $2;
|
||||
ngram_suffix = last_word = $3;
|
||||
ngram = $2 " " $3;
|
||||
}
|
||||
|
||||
#
|
||||
# trigram parsing
|
||||
#
|
||||
currorder == 3 {
|
||||
first_word = $2;
|
||||
last_word = $4;
|
||||
ngram_prefix = $2 " " $3;
|
||||
ngram_suffix = $3 " " $4;
|
||||
ngram = ngram_prefix " " last_word;
|
||||
}
|
||||
|
||||
#
|
||||
# higher-order N-gram parsing
|
||||
#
|
||||
currorder >= 4 && currorder <= order {
|
||||
first_word = $2;
|
||||
last_word = $(currorder + 1);
|
||||
ngram_infix = $3;
|
||||
for (i = 4; i <= currorder; i ++ ) {
|
||||
ngram_infix = ngram_infix " " $i;
|
||||
}
|
||||
ngram_prefix = first_word " " ngram_infix;
|
||||
ngram_suffix = ngram_infix " " last_word;
|
||||
ngram = ngram_prefix " " last_word;
|
||||
}
|
||||
|
||||
#
|
||||
# shared code for N-grams of all orders
|
||||
#
|
||||
currorder <= order {
|
||||
prob = $1;
|
||||
bow = $(currorder + 2);
|
||||
|
||||
# skip backoffs that exceed maximal order,
|
||||
# but always include unigram backoffs
|
||||
if (bow != "" && (currorder == 1 || currorder < order)) {
|
||||
# remember all LM contexts for creation of N-gram transitions
|
||||
bows[ngram] = bow;
|
||||
|
||||
# To avoid empty paths through backoff, we reroute transitions
|
||||
# out of the start node to a special backoff node that does not
|
||||
# connect directly to the end node.
|
||||
if (no_empty_bo && ngram == start_tag) {
|
||||
this_bo_name = start_bo_name;
|
||||
} else {
|
||||
this_bo_name = bo_name;
|
||||
}
|
||||
|
||||
# insert backoff transitions
|
||||
if (read_contexts ? (ngram in is_context) : \
|
||||
(currorder < order - 1)) \
|
||||
{
|
||||
add_trans(this_bo_name " " ngram, this_bo_name " " ngram_suffix, bow);
|
||||
add_trans(ngram, this_bo_name " " ngram, 0);
|
||||
} else {
|
||||
add_trans(ngram, this_bo_name " " ngram_suffix, bow);
|
||||
}
|
||||
|
||||
if (write_contexts) {
|
||||
print ngram_suffix > write_contexts;
|
||||
}
|
||||
}
|
||||
|
||||
if (last_word == start_tag) {
|
||||
if (currorder > 1) {
|
||||
printf "warning: ignoring ngram into start tag %s -> %s\n", \
|
||||
ngram_prefix, last_word >> "/dev/stderr";
|
||||
}
|
||||
} else {
|
||||
# insert N-gram transition to maximal suffix of target context
|
||||
if (last_word == end_tag) {
|
||||
target = end_tag;
|
||||
} else if (ngram in bows || currorder == 1) {
|
||||
# the minimal context is unigram
|
||||
target = ngram;
|
||||
} else if (ngram_suffix in bows) {
|
||||
target = ngram_suffix;
|
||||
} else {
|
||||
target = ngram_suffix;
|
||||
for (i = 3; i <= currorder; i ++) {
|
||||
target = substr(target, length($i) + 2);
|
||||
if (target in bows) break;
|
||||
}
|
||||
}
|
||||
|
||||
if (currorder == 1 || \
|
||||
(read_contexts ? (ngram_prefix in is_context) : \
|
||||
(currorder < order))) \
|
||||
{
|
||||
add_trans(bo_name " " ngram_prefix, target, prob);
|
||||
|
||||
# Duplicate transitions out of unigram backoff for the
|
||||
# start-backoff-node
|
||||
if (no_empty_bo && \
|
||||
node_exists(start_bo_name " " ngram_prefix) && \
|
||||
target != end_tag)
|
||||
{
|
||||
add_trans(start_bo_name " " ngram_prefix, target, prob);
|
||||
}
|
||||
} else {
|
||||
add_trans(ngram_prefix, target, prob);
|
||||
}
|
||||
|
||||
if (check_bows) {
|
||||
if (currorder < order) {
|
||||
probs[ngram] = prob;
|
||||
}
|
||||
|
||||
if (ngram_suffix in probs && \
|
||||
probs[ngram_suffix] + bows[ngram_prefix] - prob > epsilon)
|
||||
{
|
||||
printf "warning: ngram loses to backoff %s -> %s\n", \
|
||||
ngram_prefix, last_word >> "/dev/stderr";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
end_grammar(grammar_name);
|
||||
}
|
||||
49
language_model/srilm-1.7.3/utils/src/make-sub-lm.gawk
Executable file
@@ -0,0 +1,49 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# make-sub-lm --
|
||||
# extract a lower-order backoff LM from a higher order one.
|
||||
#
|
||||
# usage: make-sub-lm maxorder=<n> lm-file > sub-lm-file
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/make-sub-lm.gawk,v 1.2 1998/11/09 05:54:12 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
maxorder=2;
|
||||
}
|
||||
NF==0 {
|
||||
print; next;
|
||||
}
|
||||
/^ngram *[0-9][0-9]*=/ {
|
||||
order = substr($2,1,index($2,"=")-1);
|
||||
if (order <= maxorder) print;
|
||||
next;
|
||||
}
|
||||
/^\\[0-9]-grams:/ {
|
||||
currorder=substr($0,2,1);
|
||||
if (currorder <= maxorder) {
|
||||
print;
|
||||
} else {
|
||||
print "\n\\end\\";
|
||||
exit;
|
||||
}
|
||||
next;
|
||||
}
|
||||
/^\\/ {
|
||||
print; next;
|
||||
}
|
||||
currorder {
|
||||
if (currorder < maxorder) {
|
||||
print;
|
||||
} else if (currorder == maxorder) {
|
||||
#
|
||||
# delete backoff weight for maximal ngram
|
||||
#
|
||||
if (NF == currorder + 2) {
|
||||
$NF = "";
|
||||
}
|
||||
print;
|
||||
}
|
||||
next;
|
||||
}
|
||||
{ print }
|
||||
133
language_model/srilm-1.7.3/utils/src/merge-batch-counts
Executable file
@@ -0,0 +1,133 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# merge-batch-counts --
|
||||
# combine batch count files into a single count file
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/merge-batch-counts,v 1.9 2013/03/19 18:37:51 stolcke Exp $
|
||||
#
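# Example (hypothetical): merge all *.ngrams[.gz] files found under
# ./counts, two at a time, into a single count file:
#	merge-batch-counts ./counts
# An interrupted run can be restarted from the files produced in
# iteration 3 with
#	merge-batch-counts ./counts 3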
|
||||
|
||||
merge_options=
|
||||
merge_size=2
|
||||
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
-float-counts)
|
||||
merge_options=-float-counts
|
||||
shift
|
||||
;;
|
||||
-l) merge_size=$2
|
||||
shift; shift
|
||||
;;
|
||||
*) break
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "usage: $0 [-float-counts] [-l N] countdir [file-list | start-iter]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
countdir=${1-./counts}
|
||||
filelist=$2
|
||||
iter=0
|
||||
|
||||
mergedir=$countdir
|
||||
|
||||
merger=ngram-merge
|
||||
|
||||
newfilefile=$mergedir/newfiles$$
|
||||
|
||||
set -e
|
||||
|
||||
# find right xarg option
|
||||
if xargs -L1 </dev/null >/dev/null 2>&1; then
|
||||
xargs_l=L
|
||||
else
|
||||
xargs_l=l
|
||||
fi
|
||||
|
||||
# make sure partially generated files are deleted
|
||||
trap 'rm -f $newfile $newfilefile $test_in $test_out; exit 1' 1 2 15
|
||||
|
||||
# determine if ngram-merge can generate compressed files
|
||||
test_in=$mergedir/testin
|
||||
test_out=$mergedir/testout.gz
|
||||
|
||||
echo "x 1" > $test_in
|
||||
$merger -write $test_out $test_in $test_in
|
||||
if gzip -l $test_out >/dev/null 2>&1; then
|
||||
gz=.gz
|
||||
else
|
||||
gz=
|
||||
fi
|
||||
rm $test_in $test_out
|
||||
|
||||
case X$filelist in
|
||||
X[0-9]*)
|
||||
# restart a previous run
|
||||
what=merge
|
||||
iter=`expr $filelist + 1`
|
||||
infiles=$mergedir/$what-iter$iter.files
|
||||
find $countdir/. \( \
|
||||
-name $what-iter$filelist-\*.ngrams.gz -o \
|
||||
-name $what-iter$filelist-\*.ngrams \) -print | \
|
||||
sort | xargs -${xargs_l}2 /bin/echo > $infiles
|
||||
;;
|
||||
X)
|
||||
what=merge
|
||||
infiles=$mergedir/$what-iter$iter.files
|
||||
find $countdir/. \( \
|
||||
-name \*.ngrams.gz -o \
|
||||
-name \*.ngrams \) -print | sort | \
|
||||
xargs -${xargs_l}2 /bin/echo > $infiles
|
||||
;;
|
||||
X*)
|
||||
what=`basename $filelist .files`
|
||||
infiles=$mergedir/$what-iter$iter.files
|
||||
cat $filelist > $infiles
|
||||
;;
|
||||
esac
|
||||
|
||||
numfiles=`wc -w < $infiles`
|
||||
|
||||
while [ $numfiles -gt 1 ]; do
|
||||
echo "ITERATION $iter, $numfiles files" >&2
|
||||
fileno=1
|
||||
> $newfilefile
|
||||
while read file1 morefiles; do
|
||||
newfile=$mergedir/$what-iter$iter-$fileno.ngrams$gz
|
||||
|
||||
if [ -f $newfile ]; then
|
||||
echo "retaining old $newfile" >&2
|
||||
echo $newfile >>$newfilefile
|
||||
elif [ -z "$morefiles" ]; then
|
||||
echo "linking $file1 to $newfile" >&2
|
||||
rm -f $newfile
|
||||
ln $file1 $newfile
|
||||
|
||||
# put the linked file at the top of the file list
|
||||
# for the next iteration, to keep file sizes balanced
|
||||
mv $newfilefile $newfilefile.old
|
||||
echo $newfile >$newfilefile
|
||||
cat $newfilefile.old >> $newfilefile
|
||||
rm $newfilefile.old
|
||||
else
|
||||
echo "merging $file1 $morefiles into $newfile" >&2
|
||||
$merger $merge_options -write $newfile $file1 $morefiles
|
||||
echo $newfile >>$newfilefile
|
||||
fi
|
||||
fileno=`expr $fileno + 1`
|
||||
done < $infiles
|
||||
|
||||
xargs rm -f < $infiles
|
||||
|
||||
iter=`expr $iter + 1`
|
||||
infiles=$mergedir/$what-iter$iter.files
|
||||
cat $newfilefile | xargs -${xargs_l}$merge_size /bin/echo > $infiles
|
||||
numfiles=`wc -w < $infiles`
|
||||
done
|
||||
rm -f $newfilefile
|
||||
|
||||
echo "final counts in `cat $infiles`" >&2
|
||||
|
||||
180
language_model/srilm-1.7.3/utils/src/merge-nbest.gawk
Executable file
@@ -0,0 +1,180 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# merge-nbest --
|
||||
# merge hyps from multiple N-best lists into a single list
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/merge-nbest.gawk,v 1.8 2010/08/20 00:17:18 stolcke Exp $
|
||||
#
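# Example (hypothetical file names): merge two N-best lists, splitting
# multiwords and keeping at most 100 hyps from each input list:
#	merge-nbest multiwords=1 max_nbest=100 hyps1.gz hyps2.gz > merged
# (var=value arguments apply to the files that follow them).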
|
||||
|
||||
BEGIN {
|
||||
M_LN10 = 2.30258509299404568402; # from <math.h>
|
||||
logINF = -320;
|
||||
bytelogscale = M_LN10 * 10000.5 / 1024.0;
|
||||
|
||||
use_orig_hyps = 1;
|
||||
add_scores = 0;
|
||||
last_nbestformat = -1;
|
||||
|
||||
nbestmagic1 = "NBestList1.0";
|
||||
nbestmagic2 = "NBestList2.0";
|
||||
pause = "-pau-";
|
||||
|
||||
max_nbest = 0;
|
||||
multiwords = 0;
|
||||
multichar = "_";
|
||||
nopauses = 0;
|
||||
}
|
||||
|
||||
function log10(x) {
|
||||
return log(x) / M_LN10;
|
||||
}
|
||||
function exp10(x) {
|
||||
if (x < logINF) {
|
||||
return 0;
|
||||
} else {
|
||||
return exp(x * M_LN10);
|
||||
}
|
||||
}
|
||||
function addlogs(x,y) {
|
||||
if (x<y) {
|
||||
temp = x; x = y; y = temp;
|
||||
}
|
||||
return x + log10(1 + exp10(y - x));
|
||||
}
|
||||
|
||||
function process_nbest(file) {
|
||||
input = "exec gzip -dcf " file;
|
||||
|
||||
nbestformat = 0;
|
||||
num_hyps = 0;
|
||||
|
||||
while ((status = (input | getline)) > 0) {
|
||||
if ($1 == nbestmagic1) {
|
||||
nbestformat = 1;
|
||||
} else if ($1 == nbestmagic2) {
|
||||
nbestformat = 2;
|
||||
} else {
|
||||
words = "";
|
||||
num_words = 0;
|
||||
num_hyps ++;
|
||||
|
||||
if (max_nbest > 0 && num_hyps > max_nbest) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (nbestformat == 1) {
|
||||
for (i = 2; i <= NF; i++) {
|
||||
words = words " " $i;
|
||||
if ($i != pause) num_words ++;
|
||||
}
|
||||
score = substr($1, 2, length($1)-2)/bytelogscale;
|
||||
num_words = 1;
|
||||
} else if (nbestformat == 2) {
|
||||
prev_end_time = -1;
|
||||
for (i = 2; i <= NF; i += 11) {
|
||||
start_time = $(i + 3);
|
||||
end_time = $(i + 5);
|
||||
|
||||
# skip tokens that are subsumed by the previous word
|
||||
# (this eliminates phone and state symbols)
|
||||
# XXX: due to a bug in Decipher some state tags have
|
||||
# incorrect timemarks. We filter them based on their
|
||||
# token string.
|
||||
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
|
||||
words = words " " $i;
|
||||
if ($i != pause) num_words ++;
|
||||
prev_end_time = end_time;
|
||||
}
|
||||
}
|
||||
score = substr($1, 2, length($1)-2)/bytelogscale;
|
||||
} else {
|
||||
for (i = 4; i <= NF; i++) {
|
||||
words = words " " $i;
|
||||
}
|
||||
score = $1 + 8 * $2;
|
||||
num_words = $3;
|
||||
}
|
||||
|
||||
# resolve multiwords and eliminate pauses if so desired
|
||||
if (multiwords) {
|
||||
gsub(multichar, " ", words);
|
||||
}
|
||||
if (nopauses) {
|
||||
gsub(" " pause, " ", words);
|
||||
}
|
||||
|
||||
# if word sequence is new, record it
|
||||
if (!(words in scores)) {
|
||||
scores[words] = score;
|
||||
hyps[words] = $0;
|
||||
nwords[words] = num_words;
|
||||
} else if (add_scores) {
|
||||
scores[words] = addlogs(scores[words], score);
|
||||
}
|
||||
|
||||
if (last_nbestformat < 0) {
|
||||
last_nbestformat = nbestformat;
|
||||
} else if (nbestformat != last_nbestformat) {
|
||||
use_orig_hyps = 0;
|
||||
last_nbestformat = nbestformat;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (status < 0) {
|
||||
print "error opening " file >> "/dev/stderr";
|
||||
}
|
||||
|
||||
close(input);
|
||||
}
|
||||
|
||||
function output_nbest() {
|
||||
if (!use_orig_hyps || use_orig_hyps && last_nbestformat == 1) {
|
||||
print nbestmagic1;
|
||||
} else if (use_orig_hyps && last_nbestformat == 2) {
|
||||
print nbestmagic2;
|
||||
}
|
||||
|
||||
for (words in scores) {
|
||||
if (add_scores) {
|
||||
print scores[words], 0, nwords[words], words;
|
||||
} else if (use_orig_hyps) {
|
||||
print hyps[words];
|
||||
} else {
|
||||
print "(" (scores[words] * bytelogscale) ")" words;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
if (ARGC < 2) {
|
||||
print "usage: " ARGV[0] " N-BEST1 N-BEST2 ..." \
|
||||
>> "/dev/stderr";
|
||||
exit(2);
|
||||
}
|
||||
|
||||
for (arg = 1; arg < ARGC; arg ++) {
|
||||
if (equals = index(ARGV[arg], "=")) {
|
||||
var = substr(ARGV[arg], 1, equals - 1);
|
||||
val = substr(ARGV[arg], equals + 1);
|
||||
|
||||
if (var == "multiwords") {
|
||||
multiwords = val + 0;
|
||||
} else if (var == "multichar") {
|
||||
multichar = val;
|
||||
} else if (var == "max_nbest") {
|
||||
max_nbest = val + 0;
|
||||
} else if (var == "nopauses") {
|
||||
nopauses = val + 0;
|
||||
} else if (var == "use_orig_hyps") {
|
||||
use_orig_hyps = val + 0;
|
||||
} else if (var == "add_scores") {
|
||||
add_scores = val + 0;
|
||||
}
|
||||
} else {
|
||||
process_nbest(ARGV[arg]);
|
||||
}
|
||||
}
|
||||
|
||||
output_nbest();
|
||||
}
|
||||
|
||||
337
language_model/srilm-1.7.3/utils/src/metadb.gawk
Executable file
@@ -0,0 +1,337 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# metadb --
|
||||
# access the META-DB
|
||||
#
|
||||
# These files are subject to the SRILM Community Research License Version
|
||||
# 1.0 (the "License"); you may not use these files except in compliance
|
||||
# with the License. A copy of the License is included in the SRILM root
|
||||
# directory. Software distributed under the License is distributed on an
|
||||
# "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing rights and
|
||||
# limitations under the License. This software is Copyright (c) SRI
|
||||
# International, 1995-2011. All rights reserved.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/metadb.gawk,v 1.26 2011/11/26 06:22:34 stolcke Exp $
|
||||
#
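# A minimal illustrative config file (keys and paths are hypothetical):
#	DEFINE ROOT /path/to/experiment
#	swb_counts	ROOT/counts/swb.ngrams.gz
#	ALIAS old_counts	swb_counts
# "metadb -config FILE swb_counts" then prints the value with ROOT
# expanded; "metadb -config FILE old_counts" is redirected to the same key.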
|
||||
|
||||
function do_defines() {
|
||||
# process all defines
|
||||
for (d in defines) {
|
||||
gsub(d, defines[d]);
|
||||
}
|
||||
|
||||
# remove leading and trailing whitespace from value
|
||||
sub("^[ ]*", "");
|
||||
sub("[ ]*$", "");
|
||||
}
|
||||
|
||||
function print_error(msg) {
|
||||
print filename ", line " lineno ": " msg >> "/dev/stderr";
|
||||
}
|
||||
|
||||
# process an included file
|
||||
# return 1 if the caller should quit reading, 0 if not
|
||||
function process_config_file(file) {
|
||||
|
||||
if (file in including) {
|
||||
print "metadb INCLUDE looping through " file >> "/dev/stderr";
|
||||
exit 2
|
||||
}
|
||||
including[file] = 1;
|
||||
|
||||
if (trace_includes) {
|
||||
print "READING " file >> "/dev/stderr";
|
||||
}
|
||||
|
||||
filename = file;
|
||||
lineno = 0;
|
||||
|
||||
while ((status = (getline < file)) > 0) {
|
||||
|
||||
lineno ++;
|
||||
|
||||
# skip comments and empty lines
|
||||
if (NF == 0 || $1 ~ /^#/) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($1 == "DEFINE") {
|
||||
if (NF < 2) {
|
||||
print_error("incomplete DEFINE");
|
||||
exit 2;
|
||||
} else {
|
||||
symbol = $2;
|
||||
|
||||
$1 = $2 = "";
|
||||
do_defines();
|
||||
|
||||
defines[symbol] = $0;
|
||||
}
|
||||
} else if ($1 == "SDEFINE") {
|
||||
if (NF < 2) {
|
||||
print_error("incomplete SDEFINE");
|
||||
exit 2;
|
||||
} else {
|
||||
symbol = $2;
|
||||
|
||||
$1 = $2 = "";
|
||||
do_defines();
|
||||
|
||||
# run right-hand-side as command and use output as value
|
||||
$0 | getline defines[symbol];
|
||||
close($0);
|
||||
}
|
||||
} else if ($1 == "MDEFINE") {
|
||||
if (NF < 2) {
|
||||
print_error("incomplete MDEFINE");
|
||||
exit 2;
|
||||
} else if (!recursive) {
|
||||
symbol = $2;
|
||||
|
||||
$1 = $2 = "";
|
||||
|
||||
# look up the right-hand-side as metadb key,
|
||||
# avoiding recursive invocations
|
||||
db_command = "metadb -recursive -config " config_file " " $0;
|
||||
if (debug) {
|
||||
print "metadb: " symbol " mdefined by: " db_command >> "/dev/stderr";
|
||||
}
|
||||
|
||||
db_command | getline defines[symbol];
|
||||
close(db_command);
|
||||
}
|
||||
} else if ($1 == "UNDEF") {
|
||||
if (NF < 2) {
|
||||
print_error("incomplete UNDEF");
|
||||
exit 2;
|
||||
} else {
|
||||
delete defines[$2];
|
||||
}
|
||||
} else if ($1 == "INCLUDE") {
|
||||
if (NF < 2) {
|
||||
print_error("missing INCLUDE filename");
|
||||
exit 1
|
||||
} else {
|
||||
$1 = "";
|
||||
do_defines();
|
||||
|
||||
if (! ($0 ~ /^\//)) {
|
||||
includefile = file;
|
||||
sub("[^/]*$", "", includefile);
|
||||
if (includefile) {
|
||||
includefile = includefile $0;
|
||||
} else {
|
||||
includefile = $0;
|
||||
}
|
||||
} else {
|
||||
includefile = $0;
|
||||
}
|
||||
|
||||
if (process_config_file(includefile)) {
|
||||
close(file);
|
||||
delete including[file];
|
||||
return 1;
|
||||
}
|
||||
filename = file;
|
||||
|
||||
if (trace_includes) {
|
||||
print "READING " file >> "/dev/stderr";
|
||||
}
|
||||
}
|
||||
} else if ($1 == "ALIAS") {
|
||||
if (NF != 3 || $2 == $3) {
|
||||
print_error("illegal ALIAS");
|
||||
exit 2
|
||||
}
|
||||
|
||||
if (dump_values) print $0;
|
||||
|
||||
if ($2 == key) {
|
||||
if (debug) {
|
||||
print "metadb: " key " redirected to " $3 >> "/dev/stderr";
|
||||
}
|
||||
|
||||
# close all currently read files so they can be read again
|
||||
# from the top
|
||||
for (f in including) {
|
||||
close(f)
|
||||
}
|
||||
|
||||
# forget all current file inclusions
|
||||
delete including;
|
||||
|
||||
key = $3;
|
||||
return process_config_file(config_file);
|
||||
}
|
||||
} else if ($1 == "ALIAS_SUFFIX") {
|
||||
if (NF != 3 || $2 == $3) {
|
||||
print_error("illegal ALIAS_SUFFIX");
|
||||
exit 2
|
||||
}
|
||||
|
||||
if (dump_values) print $0;
|
||||
|
||||
suffix_len = length($2);
|
||||
key_len = length(key);
|
||||
key_prefix = substr(key, 1, key_len-suffix_len);
|
||||
|
||||
if ($2 == substr(key, key_len-suffix_len+1) && !index(key_prefix, "_")) {
|
||||
# close all currently read files so they can be read again
|
||||
# from the top
|
||||
for (f in including) {
|
||||
close(f)
|
||||
}
|
||||
|
||||
# forget all current file inclusions
|
||||
delete including;
|
||||
|
||||
old_key = key;
|
||||
key = key_prefix $3;
|
||||
|
||||
if (debug) {
|
||||
print "metadb: " old_key " redirected to " key >> "/dev/stderr";
|
||||
}
|
||||
|
||||
return process_config_file(config_file);
|
||||
}
|
||||
} else if ($1 == key || dump_values) {
|
||||
this_key = $1;
|
||||
$1 = "";
|
||||
do_defines();
|
||||
|
||||
if ($0 == "__END__") {
|
||||
if (dump_values) {
|
||||
have_keys[this_key] = 1;
|
||||
continue;
|
||||
} else {
|
||||
close(file);
|
||||
delete including[file];
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (query_mode) {
|
||||
exit 0;
|
||||
} else if (dump_values) {
|
||||
# when dumping all keys, output the first key value found
|
||||
if (!(this_key in have_keys)) {
|
||||
print this_key, $0;
|
||||
if (!all_values) {
|
||||
have_keys[this_key] = 1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (debug) {
|
||||
print "metadb: " key "=" $0 >> "/dev/stderr";
|
||||
}
|
||||
|
||||
if (!error_mode || $0 != "") {
|
||||
key_found = 1;
|
||||
print;
|
||||
}
|
||||
}
|
||||
|
||||
if (!all_values && !dump_values) {
|
||||
close(file);
|
||||
delete including[file];
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (status < 0) {
|
||||
print "error reading " file >> "/dev/stderr";
|
||||
exit 2;
|
||||
}
|
||||
close(file);
|
||||
delete including[file];
|
||||
return 0;
|
||||
}
|
||||
|
||||
function print_usage() {
|
||||
print "usage: metadb [-options ...] key1 [key2 ...]";
|
||||
print "-q query mode -- check if key is defined";
|
||||
print "-e exit with error message if key is undefined";
|
||||
print "-all return multiple key values";
|
||||
print "-dump dump all key and values";
|
||||
print "-includes list included files";
|
||||
print "-config FILE set config file (default $" db_config ")";
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
key = "";
|
||||
all_values = 0;
|
||||
dump_values = 0;
|
||||
trace_includes = 0;
|
||||
recursive = 0;
|
||||
db_config = "METADB_CONFIG";
|
||||
config_file = "";
|
||||
query_mode = 0;
|
||||
error_mode = 0;
|
||||
debug = ENVIRON["METADB_DEBUG"];
|
||||
|
||||
for (i = 1; i < ARGC ; i ++) {
|
||||
if (ARGV[i] == "-q") {
|
||||
query_mode = 1;
|
||||
} else if (ARGV[i] == "-e") {
|
||||
error_mode = 1;
|
||||
} else if (ARGV[i] == "-all") {
|
||||
all_values = 1;
|
||||
} else if (ARGV[i] == "-dump") {
|
||||
dump_values = 1;
|
||||
} else if (ARGV[i] == "-includes") {
|
||||
trace_includes = 1;
|
||||
} else if (ARGV[i] == "-recursive") {
|
||||
recursive = 1;
|
||||
} else if (ARGV[i] == "-config") {
|
||||
config_file = ARGV[i + 1];
|
||||
i ++;
|
||||
} else if (ARGV[i] == "-help") {
|
||||
print_usage();
|
||||
exit 0;
|
||||
} else if (ARGV[i] ~ /^-/) {
|
||||
print "unknown option: " ARGV[i] >> "/dev/stderr";
|
||||
exit 2;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!config_file) {
|
||||
if (db_config in ENVIRON) {
|
||||
config_file = ENVIRON[db_config];
|
||||
} else {
|
||||
print db_config " not defined" >> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (config_file == "") {
|
||||
print "empty config file name" >> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
|
||||
if (dump_values) {
|
||||
key = "";
|
||||
process_config_file(config_file);
|
||||
}
|
||||
|
||||
for ( ; i < ARGC ; i ++) {
|
||||
key = ARGV[i];
|
||||
|
||||
key_found = 0;
|
||||
process_config_file(config_file);
|
||||
|
||||
if (error_mode && !key_found) {
|
||||
print "key \"" key "\" empty or not defined in " config_file \
|
||||
>> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (query_mode) {
|
||||
# we only get here if nothing was found, so return with error
|
||||
exit 1;
|
||||
}
|
||||
}
|
||||
|
||||
56
language_model/srilm-1.7.3/utils/src/nbest-error
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# nbest-error --
|
||||
# compute minimum error of nbest lists
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-error,v 1.6 2013/03/09 07:13:02 stolcke Exp $
|
||||
#
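#
# Example invocation (hypothetical paths): compute the N-best oracle error
# for all score files under a directory against a reference file:
#
#	nbest-error nbest-scores/ refs.txt
#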
|
||||
|
||||
if [ $# -lt 2 ]; then
|
||||
echo "usage: $0 score-dir refs [nbest-lattice-option ...]" >&2
|
||||
echo " or $0 file-list refs [nbest-lattice-option ...]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
scoredir="$1"
|
||||
refs="$2"
|
||||
shift; shift
|
||||
|
||||
option=-nbest-error
|
||||
|
||||
case "$*" in
|
||||
*-lattice-error*) option= ;;
|
||||
esac
|
||||
|
||||
if [ ! -r $scoredir ]; then
|
||||
echo "$0: cannot access $scoredir" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -r $refs ]; then
|
||||
echo "$0: cannot access $refs" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -d $scoredir ]; then
|
||||
find $scoredir -follow \
|
||||
-type f \( -name \*.score -o \
|
||||
-name \*.Z -o \
|
||||
-name \*.gz \) \
|
||||
-print | sort
|
||||
else
|
||||
cat $scoredir
|
||||
fi | \
|
||||
nbest-lattice -nbest-files - -refs $refs $option "$@" | \
|
||||
${GAWK-gawk} '
|
||||
$2 ~ /^[0-9]*$/ && $10 ~ /^[0-9]*$/ && $9 == "words" {
|
||||
nsents ++;
|
||||
nwords += $10;
|
||||
nerrors += $2;
|
||||
print;
|
||||
}
|
||||
END {
|
||||
printf "%d sentences, %d words, %d errors (%.2f%%)\n", \
|
||||
nsents, nwords, nerrors, 100*nerrors/nwords;
|
||||
}'
|
||||
|
||||
96
language_model/srilm-1.7.3/utils/src/nbest-oov-counts.gawk
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# nbest-oov-counts --
|
||||
# generate OOV counts for an nbest list
|
||||
#
|
||||
# usage: nbest-oov-counts vocab=VOCAB [vocab_aliases=ALIASES] NBESTLIST > COUNTS
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-oov-counts.gawk,v 1.2 2017/08/15 19:29:34 stolcke Exp $
|
||||
#
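#
# Example (hypothetical filenames):
#
#	nbest-oov-counts vocab=vocab.txt vocab_aliases=aliases.txt \
#		nbest/utt0001.score > counts/utt0001
#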
|
||||
|
||||
BEGIN {
|
||||
nbestformat = 0;
|
||||
}
|
||||
|
||||
$1 ~ /^NBestList1\.0/ {
|
||||
nbestformat = 1;
|
||||
next;
|
||||
}
|
||||
|
||||
$1 ~ /^NBestList2\.0/ {
|
||||
nbestformat = 2;
|
||||
next;
|
||||
}
|
||||
|
||||
NR == 1 {
|
||||
nwords = 0;
|
||||
while ((getline line < vocab) > 0) {
|
||||
if (split(line, a) > 0) {
|
||||
in_vocab[a[1]] = 1;
|
||||
nwords ++;
|
||||
}
|
||||
}
|
||||
print "read " nwords " vocab words" > "/dev/stderr";
|
||||
|
||||
naliases = 0;
|
||||
if (vocab_aliases) {
|
||||
while ((getline line < vocab_aliases) > 0) {
|
||||
if (split(line, a) >= 2) {
|
||||
vocab_mapping[a[1]] = a[2];
|
||||
naliases ++;
|
||||
}
|
||||
}
|
||||
print "read " naliases " vocab aliases" > "/dev/stderr";
|
||||
}
|
||||
|
||||
# add default vocabulary
|
||||
in_vocab["<s>"] = 1;
|
||||
in_vocab["</s>"] = 1;
|
||||
in_vocab["-pau-"] = 1;
|
||||
}
|
||||
|
||||
function process_word(w) {
|
||||
if (w in vocab_mapping) {
|
||||
word = vocab_mapping[w];
|
||||
} else {
|
||||
word = w;
|
||||
}
|
||||
|
||||
if (!(word in in_vocab)) {
|
||||
oov_count ++;
|
||||
}
|
||||
}
|
||||
|
||||
NF > 1 {
|
||||
oov_count = 0;
|
||||
|
||||
if (nbestformat == 1) {
|
||||
# for Decipher nbest format 1 the fields after the aggregate score are the words
|
||||
for (i = 2; i <= NF; i ++) {
|
||||
process_word($i);
|
||||
}
|
||||
} else if (nbestformat == 2) {
|
||||
prev_end_time = -1;
|
||||
for (i = 2; i <= NF; i += 11) {
|
||||
start_time = $(i + 3);
|
||||
end_time = $(i + 5);
|
||||
|
||||
# skip tokens that are subsumed by the previous word
|
||||
# (this eliminates phone and state symbols)
|
||||
# XXX: due to a bug in Decipher some state tags have incorrect
|
||||
# timemarks. We filter them based on their token string.
|
||||
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
|
||||
process_word($i);
|
||||
|
||||
prev_end_time = end_time;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (i = 4; i <= NF; i ++) {
|
||||
process_word($i);
|
||||
}
|
||||
}
|
||||
|
||||
print oov_count;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# nbest-optimize-args-from-rover-control --
|
||||
# Extract initial score weights and arguments from rover-control file
|
||||
# for use with nbest-optimize
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-optimize-args-from-rover-control.gawk,v 1.2 2017/08/16 06:34:16 stolcke Exp $
|
||||
#
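#
# Illustration (hypothetical rover-control contents and weights):
#
#	input control file:
#		extra-scores/dur	0.3	+
#		sys1/nbest	8	0.0	1	100	12
#
#	default output (all arguments):
#		-posterior-scale 12
#		-max-nbest 100
#		-init-lambdas '1 8 0.0 0.3'
#		extra-scores/dur
#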
|
||||
|
||||
BEGIN {
|
||||
num_extras = 0;
|
||||
}
|
||||
|
||||
# skip comment or empty line
|
||||
/^##/ || /^[ ]*$/ {
|
||||
next;
|
||||
}
|
||||
|
||||
# extra score file line
|
||||
$3 == "+" {
|
||||
num_extras ++;
|
||||
extra_dir[num_extras] = $1;
|
||||
extra_weight[num_extras] = $2;
|
||||
next;
|
||||
}
|
||||
|
||||
# main system
|
||||
{
|
||||
system_dir = $1;
|
||||
lm_weight = $2;
|
||||
wt_weight = $3;
|
||||
max_nbest = $5;
|
||||
post_scale = $6;
|
||||
|
||||
weights = "1 " lm_weight " " wt_weight;
|
||||
for (i = 1; i <= num_extras; i ++) {
|
||||
weights = weights " " extra_weight[i];
|
||||
}
|
||||
|
||||
if (print_weights) {
|
||||
print weights;
|
||||
} else if (print_dirs) {
|
||||
for (i = 1; i <= num_extras; i ++) {
|
||||
print extra_dir[i];
|
||||
}
|
||||
} else {
|
||||
# output all arguments
|
||||
|
||||
if (post_scale != "" && post_scale != 0) {
|
||||
print "-posterior-scale " post_scale;
|
||||
}
|
||||
if (max_nbest != "" && max_nbest != 0) {
|
||||
print "-max-nbest " max_nbest;
|
||||
}
|
||||
|
||||
print "-init-lambdas '" weights "'";
|
||||
|
||||
for (i = 1; i <= num_extras; i ++) {
|
||||
print extra_dir[i];
|
||||
}
|
||||
}
|
||||
|
||||
num_extras = 0;
|
||||
}
|
||||
184
language_model/srilm-1.7.3/utils/src/nbest-posteriors.gawk
Executable file
@@ -0,0 +1,184 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# nbest-posteriors --
|
||||
# rescale the scores in an nbest list to reflect weighted posterior
|
||||
# probabilities
|
||||
#
|
||||
# usage: nbest-posteriors [ weight=W amw=AMW lmw=LMW wtw=WTW postscale=S max_nbest=M ] NBEST-FILE
|
||||
#
|
||||
# The output is the same input NBEST-FILE with acoustic scores set to
|
||||
# the log10 of the posterior hyp probabilities and LM scores set to zero.
|
||||
# postscale=S attenuates the posterior distribution by dividing combined log
|
||||
# scores by S (the default is S=LMW).
|
||||
#
|
||||
# If weight=W is specified the posteriors are multiplied by W.
|
||||
# (This is useful to combine multiple nbest lists in a weighted fashion).
|
||||
# The input should be in SRILM nbest-format.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-posteriors.gawk,v 1.14 2019/02/08 14:13:35 stolcke Exp $
|
||||
#
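#
# Example (hypothetical filenames and weights):
#
#	nbest-posteriors lmw=12 wtw=-0.5 weight=0.7 max_nbest=300 \
#		nbest/utt0001.score > posteriors/utt0001.score
#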
|
||||
|
||||
BEGIN {
|
||||
M_LN10 = 2.30258509299404568402;
|
||||
|
||||
weight = 1.0;
|
||||
amw = 1.0;
|
||||
lmw = 8.0;
|
||||
wtw = 0.0;
|
||||
postscale = 0;
|
||||
max_nbest = 0;
|
||||
|
||||
logINF = -320; # log10 of smallest representable number
|
||||
log_total_numerator = logINF;
|
||||
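# scale factor mapping Decipher bytelog scores to log10
# (a bytelog is roughly ln(prob) * 10000.5 / 1024)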
bytelogscale = 1024.0 / 10000.5 / M_LN10;
|
||||
|
||||
nbestformat = 0;
|
||||
noheader = 0;
|
||||
|
||||
# tag to identify nbest list in output_posteriors
|
||||
nbest_tag = 1;
|
||||
}
|
||||
|
||||
function log10(x) {
|
||||
return log(x)/M_LN10;
|
||||
}
|
||||
function exp10(x) {
|
||||
if (x <= logINF) {
|
||||
return 0;
|
||||
} else {
|
||||
return exp(x * M_LN10);
|
||||
}
|
||||
}
|
||||
function addlogs(x,y) {
|
||||
if (x<y) {
|
||||
temp = x; x = y; y = temp;
|
||||
}
|
||||
return x + log10(1 + exp10(y - x));
|
||||
}
|
||||
|
||||
# by default, use posterior scale = lmw
|
||||
NR == 1 {
|
||||
if (!postscale) {
|
||||
if (lmw == 0) {
|
||||
postscale = 1.0;
|
||||
} else {
|
||||
postscale = lmw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$1 ~ /^NBestList1\.0/ {
|
||||
nbestformat = 1;
|
||||
if (!noheader) {
|
||||
# keep header in output
|
||||
print;
|
||||
}
|
||||
|
||||
if (lmw != 0 || wtw != 0) {
|
||||
print "warning: cannot apply LMW or WTW to Decipher N-nbest lists" \
|
||||
>> "/dev/stderr";
|
||||
}
|
||||
|
||||
next;
|
||||
}
|
||||
|
||||
$1 ~ /^NBestList2\.0/ {
|
||||
nbestformat = 2;
|
||||
|
||||
if (!noheader) {
|
||||
# keep header in output
|
||||
print;
|
||||
}
|
||||
|
||||
next;
|
||||
}
|
||||
|
||||
NF > 1 {
|
||||
if (max_nbest && num_hyps == max_nbest) exit;
|
||||
|
||||
num_hyps ++;
|
||||
|
||||
if (nbestformat == 1) {
|
||||
# for Decipher nbest format 1 we use the aggregate score only
|
||||
total_score = substr($1,2,length($1)-2);
|
||||
total_score *= bytelogscale * amw/postscale;
|
||||
} else if (nbestformat == 2) {
|
||||
total_score = substr($1,2,length($1)-2);
|
||||
|
||||
# compute total AC and LM scores
|
||||
lm_score = 0;
|
||||
num_tokens = 0;
|
||||
|
||||
prev_end_time = -1;
|
||||
for (i = 2; i <= NF; i += 11) {
|
||||
start_time = $(i + 3);
|
||||
end_time = $(i + 5);
|
||||
|
||||
# skip tokens that are subsumed by the previous word
|
||||
# (this eliminates phone and state symbols)
|
||||
# XXX: due to a bug in Decipher some state tags have incorrect
|
||||
# timemarks. We filter them based on their token string.
|
||||
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
|
||||
num_tokens ++;
|
||||
|
||||
lm_score += $(i + 7);
|
||||
|
||||
prev_end_time = end_time;
|
||||
}
|
||||
}
|
||||
|
||||
# Compute AC score from total and lm scores. This takes into
|
||||
# account that the recognizer might sum scores of equivalent hyps
|
||||
# (e.g., those differing only in pauses or pronunciations) and
|
||||
# reflect the summing in the total score, but not in the word AC
|
||||
# scores.
|
||||
ac_score = total_score - lm_score;
|
||||
|
||||
# Note we don't eliminate pause tokens from the word count, since
|
||||
# the recognizer includes them in word count weighting.
|
||||
# (Only after LM rescoring are pauses ignored.)
|
||||
total_score = amw * ac_score + lmw * lm_score + wtw * num_tokens;
|
||||
total_score *= bytelogscale/postscale;
|
||||
} else {
|
||||
total_score = (amw * $1 + lmw * $2 + wtw * $3)/postscale;
|
||||
}
|
||||
|
||||
if (num_hyps == 1) {
|
||||
score_offset = total_score;
|
||||
}
|
||||
|
||||
total_score -= score_offset;
|
||||
|
||||
#
|
||||
# store posteriors and hyp words
|
||||
#
|
||||
log_posteriors[num_hyps] = total_score;
|
||||
log_total_numerator = addlogs(log_total_numerator, total_score);
|
||||
|
||||
num_words[num_hyps] = $3;
|
||||
|
||||
if (nbestformat > 0) {
|
||||
$1 = "";
|
||||
} else {
|
||||
$1 = $2 = $3 = "";
|
||||
}
|
||||
hyps[num_hyps] = $0;
|
||||
}
|
||||
|
||||
END {
|
||||
for (i = 1; i <= num_hyps; i ++) {
|
||||
unweighted_logpost = log_posteriors[i] - log_total_numerator;
|
||||
logpost = log10(weight) + unweighted_logpost;
|
||||
|
||||
if (nbestformat > 0) {
|
||||
printf "(%f) %s\n", logpost / bytelogscale, hyps[i];
|
||||
} else {
|
||||
print logpost, 0, num_words[i], hyps[i];
|
||||
}
|
||||
|
||||
if (output_posteriors) {
|
||||
print nbest_tag, i, unweighted_logpost >> output_posteriors;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
316
language_model/srilm-1.7.3/utils/src/nbest-rover
Executable file
@@ -0,0 +1,316 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# nbest-rover --
|
||||
# Combine multiple nbest lists ROVER-style
|
||||
#
|
||||
# usage: nbest-rover SENTIDS CONTROL-FILE [POSTERIORS]
|
||||
#
|
||||
# where SENTIDS is list of sentence ids (filenames of nbest lists)
|
||||
# if SENTIDS is "-" the list is inferred from the contents of
|
||||
# the first N-best directory
|
||||
# CONTROL-FILE describes the nbest list sets to be processed
|
||||
# POSTERIORS is an optional file to which word posterior probabilities
|
||||
# are written.
|
||||
#
|
||||
# The format for CONTROL-FILE is
|
||||
#
|
||||
# DIR1 LMW1 WTW1 W1 [ N1 [ S1 ] ]
|
||||
# DIR2 LMW2 WTW2 W2 [ N2 [ S2 ] ]
|
||||
# ...
|
||||
#
|
||||
# Each DIRi names a directory in which nbest lists are to be found.
|
||||
# LMWi and WTWi are the rescoring weights to be used for the corresponding
|
||||
# directory. Wi is the weight to be given to the posteriors computed from
|
||||
# the respective list. Ni are optional limits on the number of N-best hyps used.
|
||||
# Si are optional posterior scaling parameters.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-rover,v 1.43 2019/02/28 04:48:21 stolcke Exp $
|
||||
#
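#
# Example CONTROL-FILE (hypothetical directories and weights); a "+" line
# attaches an extra score directory to the system given on the next
# non-"+" line:
#
#	durscores	0.3	+
#	sys1/nbest	8	0	1	100	12
#	sys2/nbest	10	-0.5	0.8
#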
|
||||
|
||||
if [ $# -lt 2 ]; then
|
||||
echo "usage: $0 [ sentid-list | - ] control-file [posteriors [nbest-lattice-options]]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
sentids=$1
|
||||
control=$2
|
||||
shift; shift
|
||||
|
||||
# for new-style gnu sort
|
||||
_POSIX2_VERSION=199209
|
||||
export _POSIX2_VERSION
|
||||
|
||||
amw=1
|
||||
default_lmw=8
|
||||
default_wtw=0
|
||||
default_scale=0
|
||||
default_max_nbest=0
|
||||
default_weight=1
|
||||
|
||||
mesh_option=-use-mesh
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
posteriors=$1
|
||||
shift
|
||||
else
|
||||
posteriors=/dev/null
|
||||
fi
|
||||
|
||||
lattice_dir=
|
||||
posteriors_dir=
|
||||
nbest_dir=
|
||||
ref_posteriors=
|
||||
filter_script=cat
|
||||
missing_nbest=
|
||||
use_nbest_scripts=
|
||||
debug_level=0
|
||||
null_nbest=${TMPDIR-/tmp}/$$null.nbest
|
||||
|
||||
# collect remaining options (mostly to pass them to nbest-lattice)
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
-debug) debug_level=$2
|
||||
shift; shift ;;
|
||||
-amw) amw=$2;
|
||||
shift; shift ;;
|
||||
-write-dir) lattice_dir=$2
|
||||
options="$options $1 $2"
|
||||
shift; shift ;;
|
||||
-write-nbest-dir)
|
||||
nbest_dir=$2
|
||||
options="$options $1 $2"
|
||||
shift; shift ;;
|
||||
-write-nbest-posteriors)
|
||||
posteriors_dir=$2;
|
||||
shift; shift ;;
|
||||
-write-ref-posteriors)
|
||||
ref_posteriors=$2;
|
||||
options="$options -record-hyps"
|
||||
shift; shift ;;
|
||||
-no-mesh) mesh_option= ;
|
||||
shift ;;
|
||||
-wer) # -wer implies -no-mesh
|
||||
mesh_option= ;
|
||||
options="$options $1"
|
||||
shift ;;
|
||||
-missing-nbest)
|
||||
echo "0 0 0" > $null_nbest
|
||||
missing_nbest=1
|
||||
use_nbest_scripts=1
|
||||
shift ;;
|
||||
-nbest-backtrace)
|
||||
# Decipher2 format with backtrace info
|
||||
# -- need to use old nbest helper scripts
|
||||
options="$options $1"
|
||||
use_nbest_scripts=1
|
||||
shift ;;
|
||||
-nbest-backtrace-times-only)
|
||||
# Decipher 2 format - but only timing
|
||||
# information is needed
|
||||
helper_options="-nbest-backtrace -decipher-nbest"
|
||||
options="$options -nbest-backtrace"
|
||||
shift ;;
|
||||
-filter) filter_script="$2";
|
||||
shift; shift ;;
|
||||
*) options="$options $1"
|
||||
shift ;;
|
||||
esac
|
||||
done
|
||||
|
||||
> $posteriors
|
||||
|
||||
tmpdir=${TMPDIR-/tmp}
|
||||
tmp_post=$tmpdir/post$$
|
||||
tmp_sentids=$tmpdir/sentids$$
|
||||
tmp_nbest_dir=$tmpdir/nbest.dir$$
|
||||
tmp_post_dir=$tmpdir/post.dir$$
|
||||
tmp_lat_dir=$tmpdir/lat.dir$$
|
||||
|
||||
trap "rm -rf $tmp_post $tmp_sentids $tmp_nbest_dir $tmp_post_dir $tmp_lat_dir $null_nbest; exit" 0 1 2 15
|
||||
|
||||
mkdir -p $tmp_nbest_dir $tmp_post_dir $tmp_lat_dir
|
||||
|
||||
#
|
||||
# make sentid list if none was specified
|
||||
#
|
||||
if [ "$sentids" = "-" ]; then
|
||||
${GAWK-gawk} '{ print $1; exit }' $control | xargs ls | \
|
||||
sed -e 's,.*/,,' -e 's,\.gz$,,' -e 's,\.score$,,' | \
|
||||
sort > $tmp_sentids
|
||||
else
|
||||
sort +0 -1 $sentids > $tmp_sentids
|
||||
fi
|
||||
|
||||
set -e
|
||||
|
||||
#
|
||||
# create lattice output directory if needed
|
||||
#
|
||||
if [ -n "$lattice_dir" ]; then
|
||||
mkdir -p "$lattice_dir"
|
||||
elif [ -n "$ref_posteriors" ]; then
|
||||
lattice_dir=$tmp_lat_dir
|
||||
options="$options -write-dir $lattice_dir"
|
||||
fi
|
||||
|
||||
if [ -n "$nbest_dir" ]; then
|
||||
mkdir -p "$nbest_dir"
|
||||
fi
|
||||
|
||||
if [ -n "$posteriors_dir" ]; then
|
||||
mkdir -p "$posteriors_dir"
|
||||
elif [ -n "$ref_posteriors" ]; then
|
||||
posteriors_dir=$tmp_post_dir
|
||||
fi
|
||||
|
||||
cat $tmp_sentids | \
|
||||
while read sentid refwords
|
||||
do
|
||||
extra_weights=
|
||||
extra_scores=
|
||||
extra_wts_and_scores=
|
||||
|
||||
noheader=0
|
||||
|
||||
nbest_tag=1
|
||||
|
||||
if [ -n "$posteriors_dir" ]; then
|
||||
posteriors_file=$posteriors_dir/$sentid
|
||||
> $posteriors_file
|
||||
else
|
||||
posteriors_file=
|
||||
fi
|
||||
|
||||
if [ -n "$use_nbest_scripts" ]; then
|
||||
# handle DOS EOL, comment and empty lines
|
||||
sed -e 's,
|
||||
$,,' -e '/^##/d' -e '/^[ ]*$/d' $control | \
|
||||
while read dir lmw wtw weight max_nbest scale rest
|
||||
do
|
||||
if [ "$wtw" = "+" ]; then
|
||||
if [ -f $dir/$sentid.gz ]; then
|
||||
extra_scores="$extra_scores $dir/$sentid.gz"
|
||||
extra_wts_and_scores="$extra_wts_and_scores $lmw $dir/$sentid.gz"
|
||||
elif [ -f $dir/$sentid ]; then
|
||||
extra_scores="$extra_scores $dir/$sentid"
|
||||
extra_wts_and_scores="$extra_wts_and_scores $lmw $dir/$sentid"
|
||||
else
|
||||
echo "$dir/$sentid" is missing >&2
|
||||
continue
|
||||
fi
|
||||
|
||||
extra_weights="$extra_weights $lmw"
|
||||
continue
|
||||
else
|
||||
if [ -f $dir/$sentid ]; then
|
||||
nbest_file=$dir/$sentid
|
||||
elif [ -f $dir/$sentid.gz ]; then
|
||||
nbest_file=$dir/$sentid.gz
|
||||
elif [ -f $dir/$sentid.score.gz ]; then
|
||||
nbest_file=$dir/$sentid.score.gz
|
||||
elif [ -f $dir/$sentid.score ]; then
|
||||
nbest_file=$dir/$sentid.score
|
||||
else
|
||||
echo -n "$dir/$sentid.score.gz is missing" >&2
|
||||
extra_weights=
|
||||
extra_scores=
|
||||
extra_wts_and_scores=
|
||||
|
||||
if [ -n "$missing_nbest" ]; then
|
||||
echo " - using empty hyp" >&2
|
||||
nbest_file=$null_nbest
|
||||
else
|
||||
echo "" >&2
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$weight" = "=" ]; then
|
||||
weight=$last_weight
|
||||
else
|
||||
last_weight=$weight
|
||||
fi
|
||||
|
||||
if [ -n "$extra_weights" -o "$amw" != 1 ]; then
|
||||
combine-acoustic-scores \
|
||||
-v "weights=$amw $extra_weights" \
|
||||
-v max_nbest=${max_nbest:-$default_max_nbest} \
|
||||
$nbest_file $extra_scores
|
||||
else
|
||||
gzip -dcf $nbest_file
|
||||
fi | \
|
||||
nbest-posteriors noheader=$noheader \
|
||||
lmw=${lmw:-$default_lmw} \
|
||||
wtw=${wtw:-$default_wtw} \
|
||||
weight=${weight:-$default_weight} \
|
||||
max_nbest=${max_nbest:-$default_max_nbest} \
|
||||
postscale=${scale:-$default_scale} \
|
||||
nbest_tag=$nbest_tag \
|
||||
output_posteriors=$posteriors_file
|
||||
|
||||
extra_weights=
|
||||
extra_scores=
|
||||
extra_wts_and_scores=
|
||||
noheader=1
|
||||
nbest_tag=`expr $nbest_tag + 1`
|
||||
fi
|
||||
done
|
||||
else # use helper tool
|
||||
nbest-rover-helper -debug $debug_level \
|
||||
-sentid $sentid \
|
||||
-rover-control $control \
|
||||
-max-nbest $default_max_nbest \
|
||||
-rescore-amw $amw \
|
||||
-rescore-lmw $default_lmw \
|
||||
-rescore-wtw $default_wtw \
|
||||
-posterior-weight $default_weight \
|
||||
-posterior-scale $default_scale \
|
||||
-write-posteriors "$posteriors_file" \
|
||||
$helper_options
|
||||
fi | \
|
||||
eval "$filter_script" \
|
||||
> $tmp_nbest_dir/$sentid
|
||||
|
||||
if [ -n "$posteriors_file" ]; then
|
||||
gzip -f $posteriors_file
|
||||
fi
|
||||
|
||||
echo $tmp_nbest_dir/$sentid
|
||||
done | \
|
||||
nbest-lattice -nbest-files - \
|
||||
$mesh_option \
|
||||
-rescore-lmw 0 -rescore-wtw 0 \
|
||||
-posterior-amw 0 -posterior-lmw 0 -posterior-wtw 0 \
|
||||
-debug 2 $options 2>$tmp_post | \
|
||||
while read sentid hyp
|
||||
do
|
||||
# delete tmp nbest lists to avoid huge data accumulation
|
||||
if [ "$sentid" != "$last_sentid" ]; then
|
||||
rm -f $tmp_nbest_dir/$sentid
|
||||
last_sentid=$sentid
|
||||
fi
|
||||
|
||||
echo "$sentid $hyp"
|
||||
done
|
||||
|
||||
if [ -n "$ref_posteriors" ]; then
|
||||
> $ref_posteriors
|
||||
|
||||
cat $tmp_sentids | \
|
||||
while read sentid refwords
|
||||
do
|
||||
if [ -f $lattice_dir/$sentid.gz ]; then
|
||||
suffix=.gz
|
||||
else
|
||||
suffix=
|
||||
fi
|
||||
gzip -dcf $lattice_dir/$sentid$suffix | \
|
||||
find-reference-posteriors sentid=$sentid \
|
||||
posteriors_file=$posteriors_dir/$sentid$suffix >> $ref_posteriors
|
||||
done
|
||||
fi
|
||||
|
||||
# extract posteriors to file; output error messages; ignore others
|
||||
${GAWK-gawk} '$2 == "post" { $2 = ""; print; next; }
|
||||
$2 == "err" { next; }
|
||||
{ print > "/dev/stderr"; }' $tmp_post > $posteriors
|
||||
526
language_model/srilm-1.7.3/utils/src/nbest-rover-helper.cc
Normal file
@@ -0,0 +1,526 @@
|
||||
/*
|
||||
* nbest-rover-helper --
|
||||
* Preprocess nbest lists for nbest-rover
|
||||
*/
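
/*
 * Example invocation (hypothetical paths), mirroring what nbest-rover does
 * for a single sentence:
 *
 *	nbest-rover-helper -sentid utt0001 -rover-control rover.ctl \
 *		-rescore-lmw 8 -posterior-scale 12 \
 *		-write-posteriors post/utt0001
 */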
|
||||
|
||||
#ifndef lint
|
||||
static char Copyright[] = "Copyright (c) 1995-2010 SRI International, 2017 Andreas Stolcke, Microsoft Corp. All Rights Reserved.";
|
||||
static char RcsId[] = "@(#)$Id: nbest-rover-helper.cc,v 1.10 2019/09/09 23:13:15 stolcke Exp $";
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <locale.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#ifndef _MSC_VER
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "option.h"
|
||||
#include "version.h"
|
||||
#include "File.h"
|
||||
|
||||
#include "Prob.h"
|
||||
#include "Vocab.h"
|
||||
#include "NBest.h"
|
||||
#include "RefList.h"
|
||||
#include "VocabMultiMap.h"
|
||||
#include "MultiwordVocab.h" // for MultiwordSeparator
|
||||
#include "Array.cc"
|
||||
#include "MStringTokUtil.h"
|
||||
|
||||
#define DEBUG_ERRORS 1
|
||||
#define DEBUG_POSTERIORS 2
|
||||
|
||||
/*
|
||||
* default value for posterior* weights to indicate they haven't been set
|
||||
*/
|
||||
static int version = 0;
|
||||
static unsigned debug = 0;
|
||||
static char *vocabFile = 0;
|
||||
static char *vocabAliasFile = 0;
|
||||
static int toLower = 0;
|
||||
static int multiwords = 0;
|
||||
static const char *multiChar = MultiwordSeparator;
|
||||
static int nbestBacktrace = 0;
|
||||
static char *rescoreFile = 0;
|
||||
static char *nbestFiles = 0;
|
||||
static char *roverControlFile = 0;
|
||||
static char *sentid = 0;
|
||||
static char *writeNbestFile = 0;
|
||||
static char *writeNbestDir = 0;
|
||||
static int writeDecipherNbest = 0;
|
||||
static unsigned maxNbest = 0;
|
||||
static double rescoreAMW = 1.0;
|
||||
static double rescoreLMW = 8.0;
|
||||
static double rescoreWTW = 0.0;
|
||||
static double posteriorScale = 0.0;
|
||||
static double posteriorWeight = 1.0;
|
||||
static int noPosteriors = 0;
|
||||
static char *writePosteriors = 0;
|
||||
static int nbestTag = 1;
|
||||
static int optRest;
|
||||
|
||||
static Option options[] = {
|
||||
{ OPT_TRUE, "version", &version, "print version information" },
|
||||
{ OPT_UINT, "debug", &debug, "debugging level" },
|
||||
{ OPT_STRING, "vocab", &vocabFile, "vocab file" },
|
||||
{ OPT_STRING, "vocab-aliases", &vocabAliasFile, "vocab alias file" },
|
||||
{ OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
|
||||
{ OPT_TRUE, "multiwords", &multiwords, "split multiwords in N-best hyps" },
|
||||
{ OPT_STRING, "multi-char", &multiChar, "multiword component delimiter" },
|
||||
{ OPT_TRUE, "nbest-backtrace", &nbestBacktrace, "read backtrace info from N-best lists" },
|
||||
|
||||
{ OPT_STRING, "rescore", &rescoreFile, "hyp stream input file to rescore" },
|
||||
{ OPT_STRING, "nbest", &rescoreFile, "same as -rescore" },
|
||||
{ OPT_STRING, "nbest-files", &nbestFiles, "list of n-best filenames" },
|
||||
{ OPT_STRING, "rover-control", &roverControlFile, "process nbest-rover control file" },
|
||||
{ OPT_STRING, "sentid", &sentid, "sentence ID string for nbest-rover control file" },
|
||||
{ OPT_STRING, "write-nbest", &writeNbestFile, "output n-best list" },
|
||||
{ OPT_STRING, "write-nbest-dir", &writeNbestDir, "output n-best directory" },
|
||||
{ OPT_TRUE, "decipher-nbest", &writeDecipherNbest, "output Decipher n-best format" },
|
||||
{ OPT_UINT, "max-nbest", &maxNbest, "maximum number of hyps to consider" },
|
||||
{ OPT_FLOAT, "rescore-amw", &rescoreAMW, "rescoring AM weight" },
|
||||
{ OPT_FLOAT, "rescore-lmw", &rescoreLMW, "rescoring LM weight" },
|
||||
{ OPT_FLOAT, "rescore-wtw", &rescoreWTW, "rescoring word transition weight" },
|
||||
{ OPT_FLOAT, "posterior-scale", &posteriorScale, "divisor for log posterior estimates" },
|
||||
{ OPT_FLOAT, "posterior-weight", &posteriorWeight, "overall weight of posterior probabilities" },
|
||||
|
||||
{ OPT_TRUE, "no-posteriors", &noPosteriors, "do not compute posterior probabilties (acoustic rescoring only)" },
|
||||
{ OPT_STRING, "write-posteriors", &writePosteriors, "append posteriors probs to file" },
|
||||
{ OPT_INT, "nbest-tag", &nbestTag, "subsystem tag number for posterior dump" },
|
||||
{ OPT_REST, "-", &optRest, "indicate end of option list" },
|
||||
{ OPT_DOC, 0, 0, "following options, an alternating list of weights and score files/directories" },
|
||||
};
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# include <errno.h>
|
||||
# include <sys/stat.h>
|
||||
|
||||
/*
|
||||
* Emulate access(2) in Windows
|
||||
*/
|
||||
#define F_OK 0
|
||||
#define R_OK 4
|
||||
#define W_OK 2
|
||||
#define X_OK 1
|
||||
|
||||
int
|
||||
access(const char *path, int mode)
|
||||
{
|
||||
struct _stat buf;
|
||||
|
||||
if (_stat(path, &buf) < 0) {
|
||||
return -1;
|
||||
} else {
|
||||
if (mode & R_OK && !(buf.st_mode & _S_IREAD)) {
|
||||
errno = EPERM;
|
||||
return -1;
|
||||
}
|
||||
if (mode & W_OK && !(buf.st_mode & _S_IWRITE)) {
|
||||
errno = EPERM;
|
||||
return -1;
|
||||
}
|
||||
if (mode & X_OK && !(buf.st_mode & _S_IEXEC)) {
|
||||
errno = EPERM;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
#endif /* _MSC_VER */
|
||||
|
||||
|
||||
/*
|
||||
* Read a list of scores from file
|
||||
*/
|
||||
Boolean
|
||||
readScores(const char *filename, unsigned numHyps, unsigned maxN, Array<LogP2> &scores)
|
||||
{
|
||||
unsigned numScores = 0;
|
||||
|
||||
File file(filename, "r");
|
||||
char *line;
|
||||
|
||||
while ((line = file.getline())) {
|
||||
LogP2 score;
|
||||
|
||||
if (parseLogP(line, score)) {
|
||||
scores[numScores ++] = score;
|
||||
} else {
|
||||
file.position() << "bad score value\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (maxN > 0 && numScores == maxN) break;
|
||||
}
|
||||
|
||||
if (numScores == numHyps || (maxN > 0 && numScores == maxN)) {
|
||||
return true;
|
||||
} else {
|
||||
file.position() << "mismatched number of scores -- expecting "
|
||||
<< numHyps << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Process a single N-best list
|
||||
*/
|
||||
void
|
||||
processNbest(Vocab &vocab, const char *sentid,
|
||||
const char *nbestFile, unsigned maxN, Prob weight,
|
||||
double LMW, double WTW, double postScale,
|
||||
unsigned nScores, double scoreWeights[], const char *scoreFiles[],
|
||||
File &outNbestFile, unsigned tag)
|
||||
{
|
||||
/*
|
||||
* Process nbest list
|
||||
*/
|
||||
NBestList nbestList(vocab, maxN, multiwords ? multiChar : 0, nbestBacktrace);
|
||||
nbestList.debugme(debug);
|
||||
|
||||
/*
|
||||
* Posterior scaling: if not specified (= 0.0) use LMW for
|
||||
* backward compatibility.
|
||||
*/
|
||||
if (postScale == 0.0) {
|
||||
postScale = (LMW == 0.0) ? 1.0 : LMW;
|
||||
}
|
||||
|
||||
if (debug > 0) {
|
||||
cerr << "PROCESSING " << nbestFile
|
||||
<< " maxn = " << maxN
|
||||
<< " weight = " << weight
|
||||
<< " lmw = " << LMW << " wtw = " << WTW
|
||||
<< " scale = " << postScale
|
||||
<< " extras =";
|
||||
for (unsigned i = 0; i < nScores; i ++) {
|
||||
cerr << " " << scoreWeights[i]
|
||||
<< " " << scoreFiles[i];
|
||||
}
|
||||
cerr << endl;
|
||||
}
|
||||
|
||||
if (nbestFile) {
|
||||
File input(nbestFile, "r");
|
||||
|
||||
if (!nbestList.read(input)) {
|
||||
cerr << "format error in nbest list\n";
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
File input(stdin);
|
||||
|
||||
if (!nbestList.read(input)) {
|
||||
cerr << "format error in nbest list\n";
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Apply AM weight
|
||||
*/
|
||||
if (rescoreAMW != 1.0) {
|
||||
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
|
||||
nbestList.getHyp(i).acousticScore *= rescoreAMW;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Add extra scores into AM score
|
||||
*/
|
||||
for (unsigned j = 0; j < nScores; j ++) {
|
||||
if (scoreWeights[j] != 0.0) {
|
||||
Array<LogP2> extraScores;
|
||||
|
||||
if (!readScores(scoreFiles[j], nbestList.numHyps(), maxN, extraScores)) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
|
||||
nbestList.getHyp(i).acousticScore += scoreWeights[j] * extraScores[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!noPosteriors) {
|
||||
/*
|
||||
* compute log posteriors
|
||||
*/
|
||||
nbestList.computePosteriors(LMW, WTW, postScale, 1.0, true);
|
||||
LogP logWeight = ProbToLogP(weight);
|
||||
|
||||
File posteriorFile;
|
||||
if (writePosteriors && *writePosteriors) {
|
||||
posteriorFile.reopen(writePosteriors, "a");
|
||||
}
|
||||
|
||||
/*
|
||||
* Encode log posteriors as acoustic scores, for output purposes
|
||||
* Also, dump posterior to a separate file if requested
|
||||
*/
|
||||
for (unsigned i = 0; i < nbestList.numHyps(); i ++) {
|
||||
nbestList.getHyp(i).acousticScore = nbestList.getHyp(i).posterior;
|
||||
nbestList.getHyp(i).languageScore = 0.0;
|
||||
|
||||
nbestList.getHyp(i).totalScore = nbestList.getHyp(i).acousticScore;
|
||||
|
||||
if (writePosteriors && *writePosteriors) {
|
||||
/* from nbest-posteriors.gawk:
|
||||
* print nbest_tag, i, unweighted_logpost >> output_posteriors;
|
||||
*/
|
||||
posteriorFile.fprintf("%d %d %.*lg\n", tag, i+1,
|
||||
Prob_Precision, (double)nbestList.getHyp(i).posterior);
|
||||
}
|
||||
nbestList.getHyp(i).acousticScore += logWeight;
|
||||
}
|
||||
}
|
||||
|
||||
nbestList.write(outNbestFile, writeDecipherNbest);
|
||||
}
|
||||
|
||||
int
|
||||
main (int argc, char *argv[])
|
||||
{
|
||||
setlocale(LC_CTYPE, "");
|
||||
setlocale(LC_COLLATE, "");
|
||||
|
||||
argc = Opt_Parse(argc, argv, options, Opt_Number(options),
|
||||
OPT_OPTIONS_FIRST);
|
||||
|
||||
/*
|
||||
* Ensure arguments are in pairs (weight, scorefile)
|
||||
*/
|
||||
if ((argc-1) % 2 == 1) {
|
||||
cerr << "number of arguments is not even (alternating weights and score files)\n";
|
||||
exit(2);
|
||||
}
|
||||
unsigned nExtraScores = (argc-1)/2;
|
||||
|
||||
makeArray(double, scoreWeights, nExtraScores);
|
||||
makeArray(const char *, scoreFiles, nExtraScores);
|
||||
|
||||
for (unsigned i = 0; i < nExtraScores; i ++) {
|
||||
if (sscanf(argv[2*i + 1], "%lf", &scoreWeights[i]) != 1) {
|
||||
cerr << "bad score weight " << argv[2*i + 1] << endl;
|
||||
exit(2);
|
||||
}
|
||||
scoreFiles[i] = argv[2*i + 2];
|
||||
}
|
||||
|
||||
if (version) {
|
||||
printVersion(RcsId);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Vocab vocab;
|
||||
|
||||
vocab.toLower() = toLower ? true : false;
|
||||
|
||||
if (vocabFile) {
|
||||
File file(vocabFile, "r");
|
||||
vocab.read(file);
|
||||
}
|
||||
|
||||
if (vocabAliasFile) {
|
||||
File file(vocabAliasFile, "r");
|
||||
vocab.readAliases(file);
|
||||
}
|
||||
|
||||
File outFile(stdout);
|
||||
|
||||
/*
|
||||
* Process single nbest file
|
||||
*/
|
||||
if (rescoreFile) {
|
||||
if (writeNbestFile) {
|
||||
outFile.reopen(writeNbestFile, "w");
|
||||
}
|
||||
|
||||
processNbest(vocab, 0, rescoreFile, maxNbest, posteriorWeight,
|
||||
rescoreLMW, rescoreWTW, posteriorScale,
|
||||
nExtraScores, scoreWeights, scoreFiles,
|
||||
outFile, nbestTag);
|
||||
|
||||
if (writeNbestFile) {
|
||||
outFile.close();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Process list of nbest filenames
|
||||
*/
|
||||
if (nbestFiles) {
|
||||
|
||||
File file(nbestFiles, "r");
|
||||
char *line;
|
||||
while ((line = file.getline())) {
|
||||
char *strtok_ptr = NULL;
|
||||
char *fname = MStringTokUtil::strtok_r(line, wordSeparators, &strtok_ptr);
|
||||
if (!fname) continue;
|
||||
|
||||
RefString sentid = idFromFilename(fname);
|
||||
|
||||
/*
|
||||
* Construct score file names from directory path and sentid
|
||||
*/
|
||||
makeArray(char *, scoreFileNames, nExtraScores);
|
||||
|
||||
for (unsigned i = 0; i < nExtraScores; i ++) {
|
||||
scoreFileNames[i] = new char[strlen(scoreFiles[i]) + 1 + strlen(sentid) + strlen(GZIP_SUFFIX) + 1];
|
||||
|
||||
sprintf(scoreFileNames[i], "%s/%s%s", scoreFiles[i], sentid,
|
||||
GZIP_SUFFIX);
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct output file names from directory path and sentid
|
||||
*/
|
||||
makeArray(char, writeNbestName,
|
||||
(writeNbestDir ? strlen(writeNbestDir) : 0) + 1
|
||||
+ strlen(sentid) + strlen(GZIP_SUFFIX) + 1);
|
||||
|
||||
if (writeNbestDir) {
|
||||
sprintf(writeNbestName, "%s/%s%s", writeNbestDir, sentid, GZIP_SUFFIX);
|
||||
|
||||
outFile.reopen(writeNbestName, "w");
|
||||
}
|
||||
|
||||
processNbest(vocab, sentid, fname, maxNbest, posteriorWeight,
|
||||
rescoreLMW, rescoreWTW, posteriorScale,
|
||||
nExtraScores, scoreWeights, (const char **)(char **)scoreFileNames,
|
||||
outFile, nbestTag);
|
||||
|
||||
if (writeNbestDir) {
|
||||
outFile.close();
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < nExtraScores; i ++) {
|
||||
delete [] scoreFileNames[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Process rover control file
|
||||
*/
|
||||
if (roverControlFile) {
|
||||
if (!sentid) {
|
||||
cerr << "no -sentid specified with rover control file\n";
|
||||
exit(2);
|
||||
}
|
||||
|
||||
File roverControl(roverControlFile, "r");
|
||||
|
||||
if (writeNbestFile) {
|
||||
outFile.reopen(writeNbestFile, "w");
|
||||
}
|
||||
|
||||
Array<char *> extraScores;
|
||||
Array<double> extraWeights;
|
||||
unsigned nExtraScores = 0;
|
||||
Prob lastWeight = 1.0;
|
||||
|
||||
const char *scoreSuffix = ".score";
|
||||
|
||||
char *line;
|
||||
|
||||
while ((line = roverControl.getline())) {
|
||||
char scoreDir[256], plus[10];
|
||||
double lmw = rescoreLMW, wtw = rescoreWTW, postScale = posteriorScale;
|
||||
unsigned maxN = maxNbest;
|
||||
Prob weight = posteriorWeight;
|
||||
char weightStr[30];
|
||||
unsigned nparsed;
|
||||
|
||||
/*
|
||||
* nbest-rover:
|
||||
* read dir lmw wtw weight max_nbest scale rest
|
||||
*/
|
||||
if (sscanf(line, "%255s %lf %9s", scoreDir, &lmw, plus) == 3 && strcmp(plus, "+") == 0) {
|
||||
|
||||
extraScores[nExtraScores] = new char[strlen(scoreDir) + 1 + strlen(sentid) + strlen(GZIP_SUFFIX) + 1];
|
||||
sprintf(extraScores[nExtraScores], "%s/%s%s", scoreDir, sentid, GZIP_SUFFIX);
|
||||
|
||||
if (access(extraScores[nExtraScores], R_OK) < 0) {
|
||||
sprintf(extraScores[nExtraScores], "%s/%s", scoreDir, sentid);
|
||||
|
||||
if (access(extraScores[nExtraScores], R_OK) < 0) {
|
||||
roverControl.position() << "no score file for sentid " << sentid << endl;
|
||||
|
||||
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
|
||||
nExtraScores = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
extraWeights[nExtraScores] = lmw;
|
||||
|
||||
nExtraScores ++;
|
||||
|
||||
} else if ((nparsed = sscanf(line, "%255s %lf %lf %29s %u %lf", scoreDir, &lmw, &wtw, weightStr, &maxN, &postScale)) >= 1) {
|
||||
char *nbestFile = new char[strlen(scoreDir) + 1 + strlen(sentid) + strlen(scoreSuffix) + strlen(GZIP_SUFFIX) + 1];
|
||||
|
||||
sprintf(nbestFile, "%s/%s%s", scoreDir, sentid, GZIP_SUFFIX);
|
||||
if (access(nbestFile, R_OK) < 0) {
|
||||
sprintf(nbestFile, "%s/%s", scoreDir, sentid);
|
||||
|
||||
if (access(nbestFile, R_OK) < 0) {
|
||||
sprintf(nbestFile, "%s/%s%s%s", scoreDir, sentid, scoreSuffix, GZIP_SUFFIX);
|
||||
|
||||
if (access(nbestFile, R_OK) < 0) {
|
||||
sprintf(nbestFile, "%s/%s%s", scoreDir, sentid, scoreSuffix);
|
||||
|
||||
if (access(nbestFile, R_OK) < 0) {
|
||||
roverControl.position() << "no nbest file for sentid " << sentid << endl;
|
||||
|
||||
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
|
||||
nExtraScores = 0;
|
||||
delete [] nbestFile;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nparsed >= 4 && strcmp(weightStr, "=") == 0) {
|
||||
weight = lastWeight;
|
||||
} else {
|
||||
if (!parseProb(weightStr, weight)) {
|
||||
roverControl.position() << "bad weight value " << weightStr << endl;
|
||||
weight = 0.0;
|
||||
}
|
||||
lastWeight = weight;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Now combine all the files
|
||||
*/
|
||||
processNbest(vocab, sentid, nbestFile, maxN, weight,
|
||||
lmw, wtw, postScale,
|
||||
nExtraScores, extraWeights, (const char **)(char **)extraScores,
|
||||
outFile, nbestTag);
|
||||
|
||||
for (unsigned i = 0; i < nExtraScores; i ++) delete [] extraScores[i];
|
||||
nExtraScores = 0;
|
||||
delete [] nbestFile;
|
||||
|
||||
nbestTag ++;
|
||||
} else {
|
||||
roverControl.position() << "bad format in control file\n";
|
||||
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (writeNbestFile) {
|
||||
outFile.close();
|
||||
}
|
||||
}
|
||||
|
||||
exit(0);
|
||||
}
|
||||
59
language_model/srilm-1.7.3/utils/src/nbest-vocab.gawk
Executable file
@@ -0,0 +1,59 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# nbest-vocab --
|
||||
# extract vocabulary used in nbest lists
|
||||
#
|
||||
# usage: nbest-vocab NBEST-FILE ... > VOCAB
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-vocab.gawk,v 1.2 2003/03/18 00:55:07 stolcke Exp $
|
||||
#
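#
# Example (hypothetical paths): nbest-vocab nbest-dir/*.score > nbest.vocab
#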
|
||||
|
||||
BEGIN {
|
||||
nbestformat = 0;
|
||||
}
|
||||
|
||||
$1 ~ /^NBestList1\.0/ {
|
||||
nbestformat = 1;
|
||||
next;
|
||||
}
|
||||
|
||||
$1 ~ /^NBestList2\.0/ {
|
||||
nbestformat = 2;
|
||||
next;
|
||||
}
|
||||
|
||||
NF > 1 {
|
||||
if (nbestformat == 1) {
|
||||
# for Decipher nbest format 1 the fields after the aggregate score are the words
|
||||
for (i = 2; i <= NF; i ++) {
|
||||
is_word[$i] = 1;
|
||||
}
|
||||
} else if (nbestformat == 2) {
|
||||
prev_end_time = -1;
|
||||
for (i = 2; i <= NF; i += 11) {
|
||||
start_time = $(i + 3);
|
||||
end_time = $(i + 5);
|
||||
|
||||
# skip tokens that are subsumed by the previous word
|
||||
# (this eliminates phone and state symbols)
|
||||
# XXX: due to a bug in Decipher some state tags have incorrect
|
||||
# timemarks. We filter them based on their token string.
|
||||
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
|
||||
is_word[$i] = 1;
|
||||
|
||||
prev_end_time = end_time;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (i = 4; i <= NF; i ++) {
|
||||
is_word[$i] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
for (word in is_word) {
|
||||
print word;
|
||||
}
|
||||
}
|
||||
|
||||
55
language_model/srilm-1.7.3/utils/src/nbest-words.gawk
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# nbest-words --
|
||||
# extract words only from nbest lists
|
||||
#
|
||||
# usage: nbest-words NBEST-FILE ...
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/nbest-words.gawk,v 1.1 2016/04/29 04:00:08 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
nbestformat = 0;
|
||||
}
|
||||
|
||||
$1 ~ /^NBestList1\.0/ {
|
||||
nbestformat = 1;
|
||||
next;
|
||||
}
|
||||
|
||||
$1 ~ /^NBestList2\.0/ {
|
||||
nbestformat = 2;
|
||||
next;
|
||||
}
|
||||
|
||||
NF > 1 {
|
||||
words = "";
|
||||
|
||||
if (nbestformat == 1) {
|
||||
for (i = 2; i <= NF; i ++) {
|
||||
words = words " " $i;
|
||||
}
|
||||
} else if (nbestformat == 2) {
|
||||
prev_end_time = -1;
|
||||
for (i = 2; i <= NF; i += 11) {
|
||||
start_time = $(i + 3);
|
||||
end_time = $(i + 5);
|
||||
|
||||
# skip tokens that are subsumed by the previous word
|
||||
# (this eliminates phone and state symbols)
|
||||
# XXX: due to a bug in Decipher some state tags have incorrect
|
||||
# timemarks. We filter them based on their token string.
|
||||
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
|
||||
words = words " " $i
|
||||
prev_end_time = end_time;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (i = 4; i <= NF; i ++) {
|
||||
words = words " " $i;
|
||||
}
|
||||
}
|
||||
print words;
|
||||
}
|
||||
|
||||
|
||||
37
language_model/srilm-1.7.3/utils/src/nbest2-to-nbest1.gawk
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# nbest2-to-nbest1 --
|
||||
# Convert Decipher NBestList2.0 format to NBestList1.0 format
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/nbest2-to-nbest1.gawk,v 1.4 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
|
||||
BEGIN {
|
||||
magic1 = "NBestList1.0";
|
||||
magic2 = "NBestList2.0";
|
||||
}
|
||||
NR == 1 {
|
||||
if ($0 != magic2) {
|
||||
print "Input not in " magic2 " format" >> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
print magic1;
|
||||
next;
|
||||
}
|
||||
{
|
||||
prev_end_time = -1;
|
||||
line = $1;
|
||||
for (i = 2; i <= NF; i += 11) {
|
||||
start_time = $(i + 3);
|
||||
end_time = $(i + 5);
|
||||
|
||||
# skip tokens that are subsumed by the previous word
|
||||
# (this eliminates phone and state symbols)
|
||||
# XXX: due to a bug in Decipher some state tags have incorrect
|
||||
# timemarks. We filter them based on their token string.
|
||||
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
|
||||
line = line " " $i;
|
||||
prev_end_time = end_time;
|
||||
}
|
||||
}
|
||||
print line;
|
||||
}
|
||||
23
language_model/srilm-1.7.3/utils/src/pfsg-from-ngram
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# pfsg-from-ngram --
|
||||
# Convert a bigram or trigram into a Decipher PFSG
|
||||
#
|
||||
# This is a wrapper that takes care of
|
||||
# - eliminating low probability transitions that the recognizer would never use
|
||||
# - renormalizing the LM
|
||||
# - converting to PFSG
|
||||
# - adding pauses between words
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-from-ngram,v 1.3 2000/02/04 00:20:32 stolcke Exp $
|
||||
#
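#
# Example (hypothetical filenames): prune and convert a trigram LM,
# passing -order through to ngram:
#
#	pfsg-from-ngram trigram.lm -order 3 > trigram.pfsg
#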
|
||||
|
||||
# get LM from first argument, pass rest to ngram
|
||||
# default LM is stdin
|
||||
lm=${1--}
|
||||
test $# -gt 0 && shift
|
||||
|
||||
ngram -debug 1 -prune-lowprobs -lm "$lm" "$@" -write-lm - | \
|
||||
make-ngram-pfsg | \
|
||||
add-pauses-to-pfsg
|
||||
|
||||
87
language_model/srilm-1.7.3/utils/src/pfsg-to-dot.gawk
Executable file
@@ -0,0 +1,87 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# pfsg-to-dot --
|
||||
# Generate dot(1) graph description from PFSG
|
||||
#
|
||||
# usage: pfsg-to-dot [show_probs=1] [show_nums=1] file.pfsg > file.dot
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-to-dot.gawk,v 1.5 2003/07/10 21:09:15 stolcke Exp $
|
||||
#
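#
# Example (hypothetical filenames): render a PFSG with transition
# probabilities shown, then lay it out with dot(1):
#
#	pfsg-to-dot show_probs=1 dialog.pfsg > dialog.dot
#	dot -Tps dialog.dot > dialog.ps
#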
|
||||
BEGIN {
|
||||
show_probs = 0;
|
||||
show_logs = 0;
|
||||
show_nums = 0;
|
||||
in_a_pfsg = 0;
|
||||
|
||||
logscale = 10000.5;
|
||||
}
|
||||
|
||||
function bytelog2prob(p) {
|
||||
x = p / logscale;
|
||||
if (x < -7e2) {
|
||||
return 0;
|
||||
} else {
|
||||
return exp(x);
|
||||
}
|
||||
}
|
||||
|
||||
function bytelog2log10(p) {
|
||||
return p / logscale / 2.30258509299404568402;
|
||||
}
|
||||
|
||||
$1 == "name" {
|
||||
name = $2;
|
||||
|
||||
# handle repeated PFSGs in the same file
|
||||
if (in_a_pfsg)
|
||||
print "} digraph \"" name "\" {";
|
||||
else
|
||||
print "digraph \"" name "\" {";
|
||||
|
||||
print "rankdir = LR";
|
||||
dotrans = 0;
|
||||
in_a_pfsg = 1;
|
||||
}
|
||||
|
||||
function node_label(w, i) {
|
||||
if (show_nums) {
|
||||
return w "\\n" i;
|
||||
} else {
|
||||
return w;
|
||||
}
|
||||
}
|
||||
|
||||
$1 == "nodes" {
|
||||
numnodes = $2;
|
||||
for (i = 0; i < numnodes; i ++) {
|
||||
print "\tnode" i " [label=\"" $(i + 3) \
|
||||
(show_nums ? "\\n" i : "") "\"];"
|
||||
}
|
||||
}
|
||||
$1 == "initial" {
|
||||
i = $2;
|
||||
|
||||
# print "\tnode" i " [label=\"START\"];"
|
||||
}
|
||||
$1 == "final" {
|
||||
i = $2;
|
||||
|
||||
# print "\tnode" i " [label=\"END\"];"
|
||||
}
|
||||
$1 == "transitions" {
|
||||
dotrans = 1;
|
||||
next;
|
||||
}
|
||||
dotrans && NF == 3 {
|
||||
from = $1;
|
||||
to = $2;
|
||||
prob = $3;
|
||||
|
||||
print "\tnode" from " -> node" to \
|
||||
(!(show_probs || show_logs) ? "" :
|
||||
" [label=\"" (show_logs ? bytelog2log10(prob) :
|
||||
bytelog2prob(prob)) "\"]") ";"
|
||||
}
|
||||
END {
|
||||
print "}"
|
||||
}
|
||||
146
language_model/srilm-1.7.3/utils/src/pfsg-to-fsm.gawk
Executable file
@@ -0,0 +1,146 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# pfsg-to-fsm --
|
||||
# convert a Decipher PFSG to AT&T FSM format
|
||||
#
|
||||
# usage: pfsg-to-fsm [symbolfile=SYMFILE] [symbolic=1] [scale=S] file.pfsg > file.fsm
|
||||
#
|
||||
# symbolic=1 retains output word strings in the fsm file.
|
||||
# symbolfile=SYMFILE dump output symbol table to SYMFILE
|
||||
# (to be used with fsmcompile|fsmdraw|fsmprint -i SYMFILE)
|
||||
# scale=S set transition weight scaling factor to S
|
||||
# (default -1)
|
||||
#
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-to-fsm.gawk,v 1.16 2015-07-03 03:45:38 stolcke Exp $
|
||||
#
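#
# Example (hypothetical filenames), using the symbol file as suggested above:
#
#	pfsg-to-fsm symbolfile=dialog.syms dialog.pfsg > dialog.fsm
#	fsmcompile -i dialog.syms dialog.fsm > dialog.fsa
#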
|
||||
BEGIN {
|
||||
empty_output = "NULL";
|
||||
output_symbols[empty_output] = 0;
|
||||
numoutputs = 1;
|
||||
|
||||
if ("TMPDIR" in ENVIRON) {
|
||||
tmpdir = ENVIRON["TMPDIR"];
|
||||
} else {
|
||||
tmpdir = "/tmp"
|
||||
}
|
||||
|
||||
if ("pid" in PROCINFO) {
|
||||
pid = PROCINFO["pid"];
|
||||
} else {
|
||||
getline pid < "/dev/pid";
|
||||
}
|
||||
tmpfile = tmpdir "/pfsg.tmp" pid;
|
||||
|
||||
# hack to remove tmpfile when killed
|
||||
trap_cmd = ("trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null");
|
||||
print "" | trap_cmd;
|
||||
|
||||
symbolfile = "";
|
||||
symbolic = 0;
|
||||
|
||||
scale = -1; # scaling of transition weights
|
||||
nofinal = 0;	# set to 1 to suppress the final node definition
|
||||
final_output = "";
|
||||
}
|
||||
$1 == "nodes" {
|
||||
numnodes = $2;
|
||||
|
||||
for (i = 0; i < numnodes; i++) {
|
||||
node_output[i] = $(i + 3);
|
||||
|
||||
if (!(node_output[i] in output_symbols)) {
|
||||
output_symbols[node_output[i]] = numoutputs++;
|
||||
}
|
||||
}
|
||||
|
||||
next;
|
||||
}
|
||||
$1 == "initial" {
|
||||
initial_node = $2;
|
||||
|
||||
if (node_output[initial_node] != empty_output) {
|
||||
print "initial node must be NULL" >> "/dev/stderr";
|
||||
exit 1;
|
||||
}
|
||||
next;
|
||||
}
|
||||
$1 == "final" {
|
||||
final_node = $2;
|
||||
|
||||
if (final_output) {
|
||||
node_output[final_node] = final_output;
|
||||
if (!(final_output in output_symbols)) {
|
||||
output_symbols[final_output] = numoutputs++;
|
||||
}
|
||||
}
|
||||
next;
|
||||
}
|
||||
|
||||
function print_trans(from_node, to_node, cost) {
|
||||
if (to_node == final_node && node_output[final_node] == empty_output) {
|
||||
print from_node, scale * cost;
|
||||
} else {
|
||||
# PFSG bytelogs have to be negated to FSM default semiring
|
||||
print from_node, to_node, \
|
||||
(symbolic ? node_output[to_node] : \
|
||||
output_symbols[node_output[to_node]]), \
|
||||
scale * cost;
|
||||
}
|
||||
}
|
||||
|
||||
function print_final() {
|
||||
# if the final node is non-emitting, we don't need to output it
|
||||
# at all (see print_trans above)
|
||||
if (!nofinal && node_output[final_node] != empty_output) {
|
||||
print final_node, 0;
|
||||
}
|
||||
}
|
||||
|
||||
$1 == "transitions" {
|
||||
num_transitions = $2;
|
||||
|
||||
# process the transitions and map them to FSM transitions and
|
||||
# final states.
|
||||
# FSM requires the first transition to be out of the initial state,
|
||||
# so we scan the transitions twice.
|
||||
# The first time, to find the initial transitions, then
|
||||
# to add all the others. Yuck!
|
||||
for (k = 1; k <= num_transitions; k ++) {
|
||||
getline;
|
||||
|
||||
from_node = $1;
|
||||
to_node = $2;
|
||||
cost = $3;
|
||||
|
||||
if (from_node == initial_node) {
|
||||
print_trans(from_node, to_node, cost);
|
||||
} else {
|
||||
print > tmpfile;
|
||||
}
|
||||
}
|
||||
close(tmpfile);
|
||||
|
||||
# output definition of the final node
|
||||
print_final();
|
||||
|
||||
# now process all the non-initial transitions
|
||||
while (getline < tmpfile) {
|
||||
from_node = $1;
|
||||
to_node = $2;
|
||||
cost = $3;
|
||||
|
||||
print_trans(from_node, to_node, cost);
|
||||
}
|
||||
|
||||
next;
|
||||
}
|
||||
|
||||
END {
|
||||
# dump out the symbol table
|
||||
if (symbolfile) {
|
||||
for (s in output_symbols) {
|
||||
print s, output_symbols[s] > symbolfile;
|
||||
}
|
||||
}
|
||||
}
|
||||
35
language_model/srilm-1.7.3/utils/src/pfsg-vocab.gawk
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# pfsg-vocab --
|
||||
# extract vocabulary used in PFSG
|
||||
#
|
||||
# usage: pfsg-vocab PFSG-FILE ... > VOCAB
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/pfsg-vocab.gawk,v 1.1 2003/02/18 18:33:04 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
null = "NULL";
|
||||
}
|
||||
|
||||
$1 == "nodes" {
|
||||
for (i = 3; i <= NF; i ++) {
|
||||
if ($i != null) {
|
||||
is_word[$i] = 1;
|
||||
}
|
||||
}
|
||||
next;
|
||||
}
|
||||
|
||||
$1 == "name" {
|
||||
# sub-pfsg names are not words, and might have been added during the
|
||||
# processing of the nodes list
|
||||
delete is_word[$2];
|
||||
}
|
||||
|
||||
END {
|
||||
for (word in is_word) {
|
||||
print word;
|
||||
}
|
||||
}
|
||||
|
||||
55
language_model/srilm-1.7.3/utils/src/ppl-from-log.gawk
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# ppl-from-log --
|
||||
# Recomputes perplexity from (a subset of) the output of
|
||||
#
|
||||
# ngram -debug 2 -ppl
|
||||
#
|
||||
# This is useful if one wants to analyse predictability of certain
|
||||
# words/contexts.
|
||||
#
|
||||
# usage: ppl-from-log [howmany=<numsents>] ppl-log-file
|
||||
#
|
||||
# Copyright (c) 1995, SRI International. All Rights Reserved
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/ppl-from-log.gawk,v 1.4 2014-07-03 05:57:09 stolcke Exp $
|
||||
#
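#
# Example (hypothetical filenames):
#
#	ngram -debug 2 -ppl test.txt -lm lm.gz > test.ppl
#	ppl-from-log howmany=100 test.ppl
#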
|
||||
function result () {
|
||||
ppl = exp(-sum/(sentences + words - oovs) * M_LN10);
|
||||
printf "file %s: %d sentences, %d words, %d oovs\n", \
|
||||
FILENAME, sentences, words, oovs;
|
||||
printf "%d zeroprobs, logprob= %f, ppl= %f\n", \
|
||||
0, sum , ppl;
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
M_LN10 = 2.30258509299404568402; # from <math.h>
|
||||
}
|
||||
|
||||
/^ p\( / {
|
||||
if ($0 ~ /\[ -[Ii]nf|\[ -1\.#INF/) {
|
||||
oovs ++;
|
||||
} else {
|
||||
sum += $10;
|
||||
}
|
||||
if ($2 == "</s>") {
|
||||
sentences ++;
|
||||
} else {
|
||||
words ++;
|
||||
}
|
||||
next;
|
||||
}
|
||||
/ ppl= / {
|
||||
sents ++;
|
||||
if (howmany > 0 && sents == howmany) {
|
||||
result();
|
||||
exit 0;
|
||||
}
|
||||
next;
|
||||
}
|
||||
{
|
||||
next;
|
||||
}
|
||||
END {
|
||||
result();
|
||||
}
|
||||
34
language_model/srilm-1.7.3/utils/src/prettify.gawk
Executable file
34
language_model/srilm-1.7.3/utils/src/prettify.gawk
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# Map words in a text file to zero or more expansions
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/prettify.gawk,v 1.1 2001/03/24 06:41:31 stolcke Exp $
|
||||
#
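#
# Illustrative example (the map entries are made up): the "map" file has
# one entry per line, the input word followed by its expansion, e.g.
#
#	u._s. U. S.
#	foodstamps food stamps
#
# and a typical invocation is
#
#	gawk -f prettify.gawk map=pretty.map multiwords=1 < text > text.pretty
#
# where multiwords=1 additionally splits tokens joined with "_".
#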
|
||||
NR == 1 {
|
||||
# read pretty map file
|
||||
if (map) {
|
||||
while ((getline mapline < map) > 0) {
|
||||
npretty = split(mapline, pretty_list);
|
||||
word = pretty_list[1];
|
||||
pretty_map[word] = "";
|
||||
for (i = 2; i <= npretty; i ++) {
|
||||
pretty_map[word] = pretty_map[word] " " pretty_list[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function pretty_up() {
|
||||
for (i = 1; i <= NF; i ++) {
|
||||
if ($i in pretty_map) {
|
||||
$i = pretty_map[$i];
|
||||
}
|
||||
if (multiwords) gsub("_", " ", $i);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
pretty_up();
|
||||
print;
|
||||
}
|
||||
|
||||
141
language_model/srilm-1.7.3/utils/src/rank-vocab.gawk
Executable file
141
language_model/srilm-1.7.3/utils/src/rank-vocab.gawk
Executable file
@@ -0,0 +1,141 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# rank-vocab --
|
||||
# Given K different rankings of candidate vocabularies, and
|
||||
# a held-out optimization unigram count file, optimize the
|
||||
# combined ranking of words
|
||||
#
|
||||
# usage: rank-vocab counts words1 words2 ... wordsK
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/rank-vocab.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
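#
# Illustrative invocation (file names are made up): "counts" is a held-out
# unigram count file with "word count" per line, and each words file lists
# one candidate word per line, best first, e.g.
#
#	gawk -f rank-vocab.gawk heldout.1cnt lm.words web.words > ranking
#
# The output reports "RANK r WORD w OOVS n" lines showing how the residual
# OOV token count drops as words from the sources are merged greedily.
#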
|
||||
|
||||
BEGIN {
|
||||
num_sources = 0;
|
||||
num_output = 0;
|
||||
num_oovs = 0;
|
||||
|
||||
debug = 0;
|
||||
}
|
||||
|
||||
|
||||
ARGIND == 1 {
|
||||
word_count[$1] = $2;
|
||||
|
||||
num_oovs += $2;
|
||||
|
||||
next;
|
||||
}
|
||||
|
||||
ARGIND > 1 {
|
||||
k = ARGIND - 1;
|
||||
num_sources = k;
|
||||
|
||||
num_words[k] ++;
|
||||
|
||||
word_ranked[k, num_words[k]] = $1;
|
||||
next;
|
||||
}
|
||||
|
||||
function dump_words(k) {
|
||||
print "source " k " words:";
|
||||
|
||||
for (i = 1; i <= num_words[k]; i ++) {
|
||||
print i, word_ranked[k,i];
|
||||
}
|
||||
}
|
||||
|
||||
# find the next word from source k that occurs in the test set
|
||||
# return 0 if no more words are available
|
||||
function find_next(k) {
|
||||
for (j = last_chosen[k] + 1; j <= num_words[k]; j ++) {
|
||||
if (word_count[word_ranked[k,j]] > 0) {
|
||||
if (debug) {
|
||||
print "next word rank for source " k ": " j >> "/dev/stderr";
|
||||
}
|
||||
|
||||
return j;
|
||||
}
|
||||
}
|
||||
if (debug) {
|
||||
print "no more words from source " k >> "/dev/stderr";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
# compute gain (number of OOV tokens reduced per number of word types added)
|
||||
# by adding the next word from source k
|
||||
function compute_gain(k) {
|
||||
if (next_word[k] == 0) {
|
||||
# no more words in source k, no gain
|
||||
return -1;
|
||||
} else {
|
||||
g = word_count[word_ranked[k,next_word[k]]] / (next_word[k] - last_chosen[k]);
|
||||
if (debug) {
|
||||
print "next gain for source " k " = " g;
|
||||
}
|
||||
return g;
|
||||
}
|
||||
}
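# Worked example of the gain above (numbers are made up): if source k last
# contributed its rank-5 word (last_chosen[k] == 5) and its next word that
# occurs in the held-out set sits at rank 8 (next_word[k] == 8) with a
# held-out count of 12, adding it costs 8 - 5 = 3 new word types and
# removes 12 OOV tokens, giving a gain of 12 / 3 = 4.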
|
||||
|
||||
END {
|
||||
# for (k = 1; k <= num_sources; k ++) {
|
||||
# dump_words(k);
|
||||
# }
|
||||
|
||||
for (k = 1; k <= num_sources; k ++) {
|
||||
last_chosen[k] = 0;
|
||||
next_word[k] = find_next(k);
|
||||
gain[k] = compute_gain(k);
|
||||
}
|
||||
|
||||
print "INITIAL OOVS = " num_oovs;
|
||||
|
||||
# add words until no more gain possible (i.e., until all source
|
||||
# words have been used up)
|
||||
while (1) {
|
||||
best_gain = -1;
|
||||
best_source = 0;
|
||||
|
||||
# find next best source to pick word from
|
||||
for (k = 1; k <= num_sources; k ++) {
|
||||
if (gain[k] > best_gain) {
|
||||
best_source = k;
|
||||
best_gain = gain[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (best_gain < 0) break;
|
||||
|
||||
# process all the words from source k up to the one chosen
|
||||
for (i = last_chosen[best_source] + 1; \
|
||||
i <= next_word[best_source]; \
|
||||
i ++) {
|
||||
word_chosen = word_ranked[best_source,i]
|
||||
|
||||
if (debug) {
|
||||
print "source = " best_source \
|
||||
" gain = " best_gain \
|
||||
" word = " word_chosen >> "/dev/stderr";
|
||||
}
|
||||
|
||||
# output the word if it hasn't been output already
|
||||
if (!was_output[word_chosen]) {
|
||||
num_output ++;
|
||||
|
||||
num_oovs -= word_count[word_chosen];
|
||||
|
||||
print "RANK " num_output " WORD " word_chosen \
|
||||
" OOVS " num_oovs;
|
||||
|
||||
was_output[word_chosen] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
# update the statistics for the source that was chosen
|
||||
last_chosen[best_source] = next_word[best_source];
|
||||
next_word[best_source] = find_next(best_source);
|
||||
gain[best_source] = compute_gain(best_source);
|
||||
}
|
||||
}
|
||||
|
||||
106
language_model/srilm-1.7.3/utils/src/remove-lowprob-ngrams.gawk
Executable file
106
language_model/srilm-1.7.3/utils/src/remove-lowprob-ngrams.gawk
Executable file
@@ -0,0 +1,106 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# remove-lowprob-ngrams --
|
||||
# Remove ngrams from a backoff LM that have lower prob than their
|
||||
# backoff paths.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/remove-lowprob-ngrams.gawk,v 1.4 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
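#
# Sketch of the pruning criterion (a restatement of the rules coded below;
# all values are ARPA log10 probabilities): a bigram entry p(w2|w1) is kept
# only if
#
#	log p(w2|w1)  >=  bow(w1) + log p(w2)
#
# and a trigram p(w3|w1 w2) only if it is at least bow(w1 w2) + log p(w3|w2),
# falling back to the unigram path when the bigram w2 w3 is absent; entries
# that lose to their own backoff path are removed and counted.
#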
|
||||
|
||||
NF == 0 {
|
||||
print;
|
||||
next;
|
||||
}
|
||||
|
||||
/^ngram *[0-9][0-9]*=/ {
|
||||
order = substr($2,1,index($2,"=")-1);
|
||||
if (order > 3) {
|
||||
print "warning: can only handle bigrams and trigrams" >> "/dev/stderr";
|
||||
}
|
||||
if (order > maxorder && $2 !~ /=0$/) {
|
||||
maxorder = order;
|
||||
}
|
||||
print;
|
||||
next;
|
||||
}
|
||||
|
||||
/^\\[0-9]-grams:/ {
|
||||
currorder=substr($0,2,1);
|
||||
print;
|
||||
next;
|
||||
}
|
||||
/^\\/ {
|
||||
print;
|
||||
next;
|
||||
}
|
||||
|
||||
#
|
||||
# unigrams
|
||||
#
|
||||
currorder == 1 {
|
||||
word = $2;
|
||||
uni_prob[word] = $1;
|
||||
if (NF > 2) {
|
||||
uni_bow[word] = $3;
|
||||
}
|
||||
print;
|
||||
}
|
||||
|
||||
#
|
||||
# bigrams
|
||||
#
|
||||
currorder == 2 {
|
||||
prob = $1;
|
||||
word1 = $2;
|
||||
word2 = $3;
|
||||
words = $2 " " $3;
|
||||
|
||||
if (maxorder > 2) {
|
||||
bi_prob[words] = prob;
|
||||
if (NF > 3) {
|
||||
bi_bow[words] = $4;
|
||||
}
|
||||
}
|
||||
|
||||
total_bigrams ++;
|
||||
if (uni_bow[word1] + uni_prob[word2] <= prob) {
|
||||
print;
|
||||
} else {
|
||||
removed_bigrams ++;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# trigrams
|
||||
#
|
||||
currorder == 3 {
|
||||
prob = $1;
|
||||
word1 = $2;
|
||||
word2 = $3;
|
||||
word3 = $4;
|
||||
|
||||
if (word2 " " word3 in bi_prob) {
|
||||
backoff_prob = bi_bow[word1 " " word2] + bi_prob[word2 " " word3];
|
||||
} else {
|
||||
backoff_prob = bi_bow[word1 " " word2] + \
|
||||
uni_bow[word2] + uni_prob[word3];
|
||||
}
|
||||
|
||||
total_trigrams ++;
|
||||
if (backoff_prob <= prob) {
|
||||
print;
|
||||
} else {
|
||||
removed_trigrams ++;
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
if (total_bigrams > 0) {
|
||||
printf "%d out of %d bigrams removed\n", \
|
||||
removed_bigrams, total_bigrams >> "/dev/stderr";
|
||||
}
|
||||
if (total_trigrams > 0) {
|
||||
printf "%d out of %d trigrams removed\n", \
|
||||
removed_trigrams, total_trigrams >> "/dev/stderr";
|
||||
}
|
||||
}
|
||||
41
language_model/srilm-1.7.3/utils/src/replace-unk-words.gawk
Executable file
41
language_model/srilm-1.7.3/utils/src/replace-unk-words.gawk
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# replace-unk-words --
|
||||
# replace OOV words with <unk> tag
|
||||
#
|
||||
# usage: replace-unk-words vocab=<vocabfile> text > text-with-unk
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/replace-unk-words.gawk,v 1.1 2013/12/11 08:32:48 stolcke Exp $
|
||||
#
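#
# Illustrative example (file names are made up): "vocab" is a plain word
# list, one word per line (extra fields on a line are ignored), e.g.
#
#	gawk -f replace-unk-words.gawk vocab=train.vocab test.txt > test.unk
#
# Words not in the list are rewritten as <unk>; <s>, </s> and <unk> itself
# always count as in-vocabulary.
#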
|
||||
|
||||
BEGIN {
|
||||
unk = "<unk>";
|
||||
}
|
||||
|
||||
NR == 1 {
|
||||
if (vocab != "") {
|
||||
nwords = 0;
|
||||
while ((getline line < vocab) > 0) {
|
||||
if (split(line, w, " ") > 0) {
|
||||
is_word[w[1]] = 1;
|
||||
nwords += 1;
|
||||
}
|
||||
}
|
||||
close(vocab);
|
||||
print "read " nwords " words" > "/dev/stderr";
|
||||
}
|
||||
|
||||
is_word[unk] = 1;
|
||||
is_word["<s>"] = 1;
|
||||
is_word["</s>"] = 1;
|
||||
}
|
||||
|
||||
{
|
||||
for (i = 1; i <= NF; i ++) {
|
||||
if (!($i in is_word)) {
|
||||
$i = unk;
|
||||
}
|
||||
}
|
||||
print;
|
||||
}
|
||||
|
||||
223
language_model/srilm-1.7.3/utils/src/replace-words-with-classes.gawk
Executable file
223
language_model/srilm-1.7.3/utils/src/replace-words-with-classes.gawk
Executable file
@@ -0,0 +1,223 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# replace-words-with-classes --
|
||||
# replace class expansions with class names
|
||||
#
|
||||
# usage: replace-words-with-classes classes=<classfile> text > text-with-classes
|
||||
# replace-words-with-classes classes=<classfile> have_counts=1 counts \
|
||||
# > counts-with-classes
|
||||
#
|
||||
# optional arguments:
|
||||
# outfile=<file> output file for class expansion counts (default: none)
|
||||
# normalize=<0|1> normalize counts to probabilities (default = 1)
|
||||
# addone=<count> value to add to counts for probability smoothing (1)
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/replace-words-with-classes.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
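#
# Illustrative sketch of the classes file parsed below (class names and
# probabilities are made up): each line is "CLASS [prob] word1 word2 ...",
# e.g.
#
#	DIGIT 0.1 one
#	DIGIT 0.1 two
#	CITY new york
#
# Expansions without an explicit prob are later assigned the uniform
# default 1/num_class_expansions[CLASS].
#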
|
||||
|
||||
function read_classes(file) {
|
||||
|
||||
num_class_defs = 0;
|
||||
delete num_class_expansions;
|
||||
delete class_expansions;
|
||||
delete class_expansion_probs;
|
||||
|
||||
while ((getline line < file) > 0) {
|
||||
|
||||
n = split(line, a);
|
||||
if (n == 0) continue;
|
||||
|
||||
class = a[1];
|
||||
num_exp = ++ num_class_expansions[class];
|
||||
|
||||
if (a[2] ~ /^[-+0-9.][-+0-9e.]*$/) {
|
||||
prob = a[2];
|
||||
i = 3;
|
||||
} else {
|
||||
prob = "";
|
||||
i = 2;
|
||||
}
|
||||
|
||||
expansion = a[i];
|
||||
for (i++; i <= n; i++) {
|
||||
expansion = expansion " " a[i];
|
||||
}
|
||||
|
||||
class_expansions[class " " num_exp] = expansion;
|
||||
if (prob != "") {
|
||||
class_expansion_probs[class " " num_exp] = prob;
|
||||
}
|
||||
num_class_defs ++;
|
||||
}
|
||||
|
||||
print "read " num_class_defs " class expansions" >> "/dev/stderr";
|
||||
|
||||
# assign default expansion probs
|
||||
|
||||
for (class in num_class_expansions) {
|
||||
|
||||
num_exp = num_class_expansions[class];
|
||||
|
||||
for (i = 1; i <= num_exp; i ++) {
|
||||
if (class_expansion_probs[class " " i] == "") {
|
||||
class_expansion_probs[class " " i] = 1/num_exp;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
##############################################################################
|
||||
|
||||
function add_to_prefix_tree(class, expansion, prob) {
|
||||
|
||||
nwords = split(expansion, w);
|
||||
|
||||
node = 0;
|
||||
|
||||
for (k = 1; k <= nwords; k ++) {
|
||||
next_node = tree[node " " w[k]];
|
||||
|
||||
if (!next_node) {
|
||||
next_node = ++num_nodes;
|
||||
tree[node " " w[k]] = next_node;
|
||||
}
|
||||
|
||||
node = next_node;
|
||||
}
|
||||
|
||||
if (!(node in node_class)) {
|
||||
node_class[node] = class;
|
||||
node_prob[node] = prob;
|
||||
}
|
||||
return node;
|
||||
}
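# Worked example of the prefix tree built above (node numbers are
# arbitrary): inserting the expansions "new york" and "new york city"
# creates tree["0 new"] = 1, tree["1 york"] = 2 and tree["2 city"] = 3,
# with node_class[2] and node_class[3] naming the owning class; the
# scanning loop further below walks the input word by word through these
# entries and remembers the longest complete expansion seen so far.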
|
||||
|
||||
BEGIN {
|
||||
normalize = 1;
|
||||
addone = 1;
|
||||
partial = 0;
|
||||
}
|
||||
|
||||
NR == 1 {
|
||||
if (classes) {
|
||||
read_classes(classes);
|
||||
close(classes);
|
||||
} else {
|
||||
print "no classes file specified" >> "/dev/stderr";
|
||||
}
|
||||
|
||||
for (class in num_class_expansions) {
|
||||
for (i = 1; i <= num_class_expansions[class]; i ++) {
|
||||
class_expansion_node[class " " i] = \
|
||||
add_to_prefix_tree(class, class_expansions[class " " i], \
|
||||
class_expansion_probs[class " " i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
output = "";
|
||||
next_pos = 1;
|
||||
|
||||
|
||||
# partial option: multiple spaces block multiword replacement
|
||||
if (partial) {
|
||||
gsub("[ ][ ]*[ ]", " | ");
|
||||
}
|
||||
|
||||
#
|
||||
# handle ngram counts by simply leaving the count value alone
|
||||
# and doing substitution on the ngram itself.
|
||||
#
|
||||
if (have_counts) {
|
||||
max_pos = NF - 1;
|
||||
} else {
|
||||
max_pos = NF;
|
||||
}
|
||||
|
||||
while (next_pos <= max_pos) {
|
||||
|
||||
class = "";
|
||||
prob = 0;
|
||||
num_exp_words = 0;
|
||||
|
||||
# search for largest class expansion starting at current position
|
||||
node = 0;
|
||||
k = 0;
|
||||
while (1) {
|
||||
node = tree[node " " $(next_pos + k)];
|
||||
|
||||
if (node) {
|
||||
if (node in node_class) {
|
||||
# we have found a complete expansion, record its class
|
||||
class = node_class[node];
|
||||
class_node = node;
|
||||
prob = node_prob[node];
|
||||
num_exp_words = k + 1;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
k ++;
|
||||
}
|
||||
|
||||
if (next_pos == 1) {
|
||||
space = "";
|
||||
} else {
|
||||
space = " ";
|
||||
}
|
||||
|
||||
if (!class) {
|
||||
output = output space $next_pos;
|
||||
next_pos ++;
|
||||
} else {
|
||||
output = output space class;
|
||||
next_pos += num_exp_words;
|
||||
|
||||
node_count[class_node] ++;
|
||||
class_count[class] ++;
|
||||
}
|
||||
}
|
||||
|
||||
# partial option: multiple spaces block multiword replacement
|
||||
if (partial) {
|
||||
gsub(" [|] ", " ", output);
|
||||
sub("^[|]", " ", output);
|
||||
sub("[|]$", " ", output);
|
||||
}
|
||||
|
||||
if (have_counts) {
|
||||
print output, $NF;
|
||||
} else {
|
||||
print output;
|
||||
}
|
||||
}
|
||||
|
||||
function estimate(count, total, N) {
|
||||
denom = total + N * addone;
|
||||
|
||||
if (denom == 0) {
|
||||
return 0;
|
||||
} else {
|
||||
return (count + addone)/denom;
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
if (outfile) {
|
||||
for (class in num_class_expansions) {
|
||||
for (i = 1; i <= num_class_expansions[class]; i ++) {
|
||||
nc = node_count[class_expansion_node[class " " i]] + 0;
|
||||
print class, \
|
||||
normalize ? \
|
||||
estimate(nc, class_count[class], \
|
||||
num_class_expansions[class]) :
|
||||
nc, \
|
||||
class_expansions[class " " i] > outfile;
|
||||
}
|
||||
}
|
||||
close(outfile);
|
||||
}
|
||||
}
|
||||
|
||||
70
language_model/srilm-1.7.3/utils/src/rescore-acoustic
Executable file
70
language_model/srilm-1.7.3/utils/src/rescore-acoustic
Executable file
@@ -0,0 +1,70 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# rescore-acoustic --
|
||||
# Replace acoustic Nbest scores with a weighted combination of
|
||||
# old and new acoustic scores
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-acoustic,v 1.8 2015-07-03 03:45:39 stolcke Exp $
|
||||
#
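#
# Illustrative invocation (directory names and weights are made up):
#
#	rescore-acoustic old-nbest 1.0 mllr-scores 0.5 combined-nbest 100
#
# joins each old N-best/score file with the matching file under
# mllr-scores/ by sentence id and writes scores combined with weights
# "1.0 0.5" to combined-nbest/<sentid>.score.gz, keeping at most 100 hyps
# per utterance.
#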
|
||||
|
||||
if [ $# -lt 5 ]; then
|
||||
echo "usage: $0 old-nbest-dir old-ac-weight new-score-dir1 new-ac-weight1 new-score-dir2 new-ac-weight2 ... new-nbest-dir [max-nbest]" >&2
|
||||
echo " or $0 old-file-list old-ac-weight new-score-dir1 new-ac-weight1 new-score-dir2 new-ac-weight2 ... new-nbest-dir [max-nbest]" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
old_nbest=${1}
|
||||
old_acw=${2}
|
||||
shift; shift
|
||||
|
||||
new_scores=
|
||||
new_acw=
|
||||
while [ $# -ge 3 ]
|
||||
do
|
||||
new_scores="$new_scores $1"
|
||||
new_acw="$new_acw $2"
|
||||
shift; shift
|
||||
done
|
||||
new_nbest=${1}
|
||||
max_nbest=${2-0}
|
||||
|
||||
set -e
|
||||
|
||||
tmpdir=${TMPDIR-/tmp}
|
||||
join1="$tmpdir/join1_$$"
|
||||
join2="$tmpdir/join2_$$"
|
||||
trap "rm -f $join1 $join2" 0 1 2 15
|
||||
|
||||
echo "generating sentids ..." >&2
|
||||
if [ -d $old_nbest ]; then
|
||||
find $old_nbest/. -follow -type f -print
|
||||
else
|
||||
cat $old_nbest
|
||||
fi | \
|
||||
sed -e 's,.*,& &,' -e 's,[^ ]*/,,' -e 's,\.gz , ,' -e 's,\.score , ,' | \
|
||||
sort -k 1,1 > $join1
|
||||
|
||||
echo "`wc -l < $join1` utterances" >&2
|
||||
|
||||
for d in $new_scores
|
||||
do
|
||||
echo "joining $d ..." >&2
|
||||
find $d/. -follow -type f -print | \
|
||||
sed -e 's,.*,& &,' -e 's,[^ ]*/,,' -e 's,\.gz , ,' |\
|
||||
sort -k 1,1 | \
|
||||
/usr/local/gnu/bin/join $join1 - > $join2
|
||||
mv $join2 $join1
|
||||
done
|
||||
echo "`wc -l < $join1` utterances after joining" >&2
|
||||
|
||||
mkdir -p $new_nbest
|
||||
|
||||
cat $join1 | \
|
||||
while read sentid scorefiles
|
||||
do
|
||||
echo $sentid >&2
|
||||
combine-acoustic-scores -v "weights=$old_acw $new_acw" \
|
||||
-v max_nbest=$max_nbest $scorefiles | \
|
||||
gzip > $new_nbest/$sentid.score.gz
|
||||
done
|
||||
|
||||
466
language_model/srilm-1.7.3/utils/src/rescore-decipher
Executable file
466
language_model/srilm-1.7.3/utils/src/rescore-decipher
Executable file
@@ -0,0 +1,466 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# rescore-decipher --
|
||||
# generate scores from Decipher(TM) n-best lists
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-decipher,v 1.40 2017/07/20 05:43:59 stolcke Exp $
|
||||
#
|
||||
|
||||
bytelog=0
|
||||
nodecipherlm=0
|
||||
multiwords=0
|
||||
norescore=0
|
||||
decipher_lmw=8
|
||||
decipher_wtw=0
|
||||
lm_only=0
|
||||
pretty_file=
|
||||
filter_command=
|
||||
limit_vocab=0
|
||||
vocab_aliases=
|
||||
fast_rescore=
|
||||
ngram_tool=ngram
|
||||
ngram_options=
|
||||
count_oovs=0
|
||||
rescore_option=-rescore
|
||||
multichar=_
|
||||
tmpdir=${TMPDIR-/tmp}
|
||||
|
||||
while [ $# -gt 0 ]
|
||||
do
|
||||
case "$1" in
|
||||
-bytelog)
|
||||
bytelog=1
|
||||
;;
|
||||
-nodecipherlm)
|
||||
nodecipherlm=1
|
||||
;;
|
||||
-multiwords)
|
||||
multiwords=1
|
||||
mw_option=-multiwords
|
||||
smw_option=-split-multiwords
|
||||
;;
|
||||
-multi-char)
|
||||
multichar="$2"; shift
|
||||
;;
|
||||
-norescore)
|
||||
norescore=1
|
||||
;;
|
||||
-lm-only)
|
||||
lm_only=1
|
||||
;;
|
||||
-count-oovs)
|
||||
count_oovs=1
|
||||
rescore_option="-debug 1 -ppl"
|
||||
;;
|
||||
-pretty)
|
||||
pretty_file="$2"; shift
|
||||
;;
|
||||
-ngram-tool)
|
||||
ngram_tool="$2"; shift
|
||||
;;
|
||||
-filter)
|
||||
filter_command="$2"; shift
|
||||
;;
|
||||
-limit-vocab)
|
||||
limit_vocab=1
|
||||
;;
|
||||
-vocab-aliases)
|
||||
vocab_aliases="$2"; shift
|
||||
;;
|
||||
-fast)
|
||||
fast_rescore=1
|
||||
;;
|
||||
-*) echo "$0: unknown option $1" >&2
|
||||
exit 2 ;;
|
||||
*) break
|
||||
;;
|
||||
esac
|
||||
|
||||
shift
|
||||
done
|
||||
|
||||
if [ $# -lt 3 ]; then
|
||||
{
|
||||
echo "usage: $0 [-bytelog] [-nodecipherlm] [-multiwords] [-multi-char C] [-norescore] [-lm-only] [-count-oovs] [-pretty map] [-ngram-tool pgm] [-filter command] [-limit-vocab] [-vocab-aliases map] [-fast] nbest-file-list score-dir lm-options ..." >&2
|
||||
echo "where"
|
||||
echo " -bytelog produces bytelog scaled scores"
|
||||
echo " -nodecipherlm avoids Decipher LM score computation"
|
||||
echo " -multiwords expand multiwords into constituent words"
|
||||
echo " -multi-char C redefine multiword separator character"
|
||||
echo " -norescore don't rescore LM, just extract scores"
|
||||
echo " -lm-only output no N-best lists, only LM scores"
|
||||
echo " -count-oovs output number of OOV and zeroprob words"
|
||||
echo " -pretty map word mapping file"
|
||||
echo " -ngram-tool pgm use pgm for LM evaluation"
|
||||
echo " -filter command text filter to apply to N-best hyps"
|
||||
echo " -limit-vocab limit LM loading to used vocabulary"
|
||||
echo " -vocab-aliases map map vocabulary in LM evaluation"
|
||||
echo " -fast fast rescoring mode, no text filtering allowed"
|
||||
} >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
filelist="$1"
|
||||
scoredir="$2"
|
||||
shift; shift
|
||||
|
||||
if [ ! -d $scoredir ]; then
|
||||
mkdir $scoredir
|
||||
fi
|
||||
|
||||
# when not rescoring need to get decipher lmw and wtw from remaining options
|
||||
if [ $norescore -gt 0 ]; then
|
||||
while [ $# -gt 0 ]
|
||||
do
|
||||
case "$1" in
|
||||
-decipher-lmw)
|
||||
decipher_lmw=$2
|
||||
shift
|
||||
;;
|
||||
-decipher-wtw)
|
||||
decipher_wtw=$2
|
||||
shift
|
||||
;;
|
||||
*) shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
fi
|
||||
|
||||
if [ $norescore -eq 0 -a $limit_vocab -gt 0 ]; then
|
||||
#
|
||||
# limit LM vocabulary to words found in the nbest lists
|
||||
#
|
||||
|
||||
nbestvocab="$tmpdir/$$nbest.vocab"
|
||||
trap "rm -f $nbestvocab; exit" 0 1 2 15
|
||||
|
||||
# generate nbest vocabulary
|
||||
if [ -z "$filter_command" ]; then
|
||||
nbest-lattice -no-rescore -no-reorder \
|
||||
$mw_option -multi-char "$multichar" \
|
||||
-nbest-files "$filelist" -write-vocab $nbestvocab
|
||||
else
|
||||
cat "$filelist" | xargs gzip -dcf | \
|
||||
eval "$filter_command" | \
|
||||
ngram -rescore - -null -no-reorder \
|
||||
$smw_option -multi-char "$multichar" \
|
||||
-write-vocab $nbestvocab >/dev/null
|
||||
fi
|
||||
|
||||
# tell ngram to use this vocab
|
||||
ngram_options="-limit-vocab -vocab $nbestvocab"
|
||||
|
||||
fi
|
||||
|
||||
if [ $norescore -eq 0 -a -n "$vocab_aliases" ]; then
|
||||
if [ $limit_vocab -gt 0 ]; then
|
||||
nbestvocabalias="$tmpdir/$$nbest.vocabalias"
|
||||
trap "rm -f $nbestvocab $nbestvocabalias; exit" 0 1 2 15
|
||||
|
||||
sort -k 2,2 $vocab_aliases | \
|
||||
join -1 2 -o 1.1,1.2 - $nbestvocab > $nbestvocabalias
|
||||
|
||||
# tell ngram to use these vocab-aliases
|
||||
ngram_options="$ngram_options -vocab-aliases $nbestvocabalias"
|
||||
else
|
||||
# tell ngram to use this vocab-alias
|
||||
ngram_options="-vocab-aliases $vocab_aliases"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$fast_rescore" ]; then
|
||||
|
||||
#
|
||||
# Fast rescoring mode:
|
||||
# Hand N-best lists directly to ngram. No text filtering is supported
|
||||
#
|
||||
|
||||
if [ -n "$pretty_file" -o -n "$filter_command" -o $lm_only -gt 0 -o $count_oovs -gt 0 ]
|
||||
then
|
||||
echo "Text filtering, -lm-only, and -count-oovs not supported with -fast" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [ $nodecipherlm -eq 0 ]; then
|
||||
echo "Must use -nodecipherlm with -fast" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [ $norescore -gt 0 ]; then
|
||||
nbest-lattice -no-rescore -no-reorder $mw_option \
|
||||
-nbest-files "$filelist" \
|
||||
-write-nbest-dir "$scoredir"
|
||||
else
|
||||
if [ "$multiwords" -gt 0 ]; then
|
||||
mw_option=-split-multiwords
|
||||
fi
|
||||
$ngram_tool \
|
||||
-no-reorder $mw_option -multi-char "$multichar" \
|
||||
-nbest-files "$filelist" \
|
||||
-write-nbest-dir "$scoredir" \
|
||||
-rescore-lmw 1 -rescore-wtw 1 \
|
||||
$ngram_options "$@"
|
||||
fi
|
||||
|
||||
else # fast_rescore
|
||||
|
||||
#
|
||||
# General rescoring mode:
|
||||
# Concatenate hyps for all nbest list, record number of hyps for
|
||||
# each file in the output stream
|
||||
# Feed to ngram -rescore (using lm-options)
|
||||
# or using -ppl for counting OOVs
|
||||
# Parse ngram output into lm scores and deposit into target files
|
||||
#
|
||||
|
||||
escape="***FILE:"
|
||||
|
||||
cat $filelist | ( \
|
||||
while read filename rest; do
|
||||
case $filename in
|
||||
# preserve LMstate labels in the file list and pass them to ngram
|
||||
"<LMstate>") echo $filename $rest
|
||||
continue ;;
|
||||
esac
|
||||
gzip -dcf $filename | \
|
||||
${GAWK-gawk} '
|
||||
BEGIN {
|
||||
filename = "";
|
||||
numhyps = 0;
|
||||
nbestformat = 0;
|
||||
|
||||
# constants
|
||||
bytelogscale = 2.30258509299404568402 * 10000.5 / 1024.0;
|
||||
pause = "-pau-";
|
||||
}
|
||||
|
||||
function bytelog2log10(x) {
|
||||
return x / bytelogscale;
|
||||
}
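# (A bytelog score is the natural log of a probability scaled by
# 10000.5/1024, a Decipher convention; dividing by bytelogscale above
# therefore converts it back to log10.)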
|
||||
|
||||
NR == 1 {
|
||||
sentid = filename;
|
||||
sub("^.*/", "", sentid);
|
||||
sub("\\.gz$", "", sentid);
|
||||
sub("\\.Z$", "", sentid);
|
||||
sub("\\.score$", "", sentid);
|
||||
sub("\\.wv$", "", sentid);
|
||||
sub("\\.wav$", "", sentid);
|
||||
sub("\\.wav_cep$", "", sentid);
|
||||
|
||||
# read pretty map file
|
||||
if (pretty_file) {
|
||||
while ((getline mapline < pretty_file) > 0) {
|
||||
npretty = split(mapline, pretty_list);
|
||||
word = pretty_list[1];
|
||||
pretty_map[word] = "";
|
||||
for (i = 2; i <= npretty; i ++) {
|
||||
pretty_map[word] = pretty_map[word] " " pretty_list[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
print escape, sentid;
|
||||
}
|
||||
|
||||
function pretty_up(start) {
|
||||
for (i = start; i <= NF; i ++) {
|
||||
if ($i in pretty_map) {
|
||||
$i = pretty_map[$i];
|
||||
}
|
||||
if (multiwords) gsub(multichar, " ", $i);
|
||||
}
|
||||
}
|
||||
|
||||
/^NBestList1\.0/ {
|
||||
nbestformat = 1;
|
||||
if (nodecipherlm) {
|
||||
printf "%s: -nodecipherlm ineffective for NBestList1.0\n", filename > "/dev/stderr" ;
|
||||
}
|
||||
next;
|
||||
}
|
||||
/^NBestList2\.0/ {
|
||||
nbestformat = 2;
|
||||
next;
|
||||
}
|
||||
{
|
||||
numhyps ++;
|
||||
if (nbestformat == 0) {
|
||||
pretty_up(4);
|
||||
if (count_oovs) {
|
||||
# output only the words, add <s> to handle empty hyps
|
||||
$1 = $2 = $3 = "";
|
||||
print "<s>", $0;
|
||||
} else {
|
||||
print;
|
||||
}
|
||||
} else if (nbestformat == 1) {
|
||||
pretty_up(2);
|
||||
|
||||
if (count_oovs) {
|
||||
# output only the words, add <s> to handle empty hyps
|
||||
$1 = "";
|
||||
print "<s>", $0;
|
||||
} else if (norescore) {
|
||||
# convert to SRILM format
|
||||
score = substr($1,2,length($1)-2);
|
||||
$1 = "";
|
||||
print bytelog2log10(score), 0, 0, $0;
|
||||
} else {
|
||||
# keep Decipher format
|
||||
print;
|
||||
}
|
||||
} else if (nbestformat == 2) {
|
||||
score = substr($1,2,length($1)-2);
|
||||
|
||||
# compute total AC and LM scores
|
||||
lm_score = 0;
|
||||
num_words = 0;
|
||||
num_pauses = 0;
|
||||
|
||||
words = "";
|
||||
prev_end_time = -1;
|
||||
for (i = 2; i <= NF; i += 11) {
|
||||
start_time = $(i + 3);
|
||||
end_time = $(i + 5);
|
||||
|
||||
# skip tokens that are subsumed by the previous word
|
||||
# (this eliminates phone and state symbols)
|
||||
# XXX: due to a bug in Decipher some state tags have incorrect
|
||||
# timemarks. We filter them based on their token string.
|
||||
if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
|
||||
words = words " " $i;
|
||||
|
||||
num_words ++;
|
||||
if ($i == pause) num_pauses ++;
|
||||
|
||||
lm_score += $(i + 7);
|
||||
|
||||
prev_end_time = end_time;
|
||||
}
|
||||
}
|
||||
|
||||
$0 = $1 " " words;
|
||||
|
||||
pretty_up(2);
|
||||
|
||||
# Compute AC score from total and lm scores. This takes into
|
||||
# account that the recognizer might sum scores of equivalent hyps
|
||||
# (e.g., those differing only in pauses or pronunciations) and
|
||||
# reflect the summing in the total score, but not in the word AC
|
||||
# scores.
|
||||
ac_score = score - lm_score;
|
||||
|
||||
if (count_oovs) {
|
||||
# output only the words, add <s> to handle empty hyps
|
||||
$1 = "";
|
||||
print "<s>", $0;
|
||||
} else if (norescore) {
|
||||
# convert to SRILM nbest format
|
||||
# NOTES:
|
||||
# - subtract Decipher WTW (including for pauses!)
|
||||
# - compute number of words WITHOUT pauses for output
|
||||
$1 = "";
|
||||
print bytelog2log10(ac_score), \
|
||||
bytelog2log10(lm_score/decipher_lmw) - \
|
||||
num_words * decipher_wtw, \
|
||||
split(words, dummy) - num_pauses, $0;
|
||||
} else if (nodecipherlm) {
|
||||
# output only acoustic score in Decipher format
|
||||
$1 = "(" ac_score ")";
|
||||
print;
|
||||
} else {
|
||||
# output combined score in Decipher format
|
||||
print;
|
||||
}
|
||||
}
|
||||
}
|
||||
END {
|
||||
if (numhyps == 0) {
|
||||
print "WARNING: nbest list " filename " is empty" \
|
||||
> "/dev/stderr" ;
|
||||
}
|
||||
}
|
||||
' filename=$filename escape="$escape" count_oovs=$count_oovs \
|
||||
nodecipherlm=$nodecipherlm multiwords=$multiwords \
|
||||
multichar="$multichar" pretty_file="$pretty_file" \
|
||||
norescore=$norescore decipher_lmw=$decipher_lmw decipher_wtw=$decipher_wtw
|
||||
done
|
||||
) | \
|
||||
if [ $norescore -gt 0 -a -z "$filter_command" ]; then
|
||||
# no rescoring and no filtering
|
||||
cat
|
||||
elif [ $norescore -gt 0 ]; then
|
||||
# no rescoring, but filter hyps
|
||||
eval "$filter_command"
|
||||
elif [ -z "$filter_command" ]; then
|
||||
# standard rescoring without filtering
|
||||
$ngram_tool -debug 1 $rescore_option - -rescore-lmw 1 -rescore-wtw 1 \
|
||||
-escape "$escape " $ngram_options "$@"
|
||||
else
|
||||
# rescoring with filtering
|
||||
eval "$filter_command" | \
|
||||
$ngram_tool -debug 1 $rescore_option - -rescore-lmw 1 -rescore-wtw 1 \
|
||||
-escape "$escape " $ngram_options "$@"
|
||||
fi | \
|
||||
${GAWK-gawk} -v bytelog=$bytelog '
|
||||
BEGIN {
|
||||
currentfile = "";
|
||||
scoredir = "";
|
||||
scorefile = "";
|
||||
numhyps = 0;
|
||||
bytelogscale = 2.30258509299404568402 * 10000.5 / 1024.0;
|
||||
}
|
||||
$1 == escape {
|
||||
if (currentfile) {
|
||||
close(scorefile);
|
||||
}
|
||||
currentfile = $2;
|
||||
sub("
|
||||
$", "", currentfile);
|
||||
if (!lm_only && !count_oovs) {
|
||||
# backward compatibility
|
||||
currentfile = currentfile ".score";
|
||||
}
|
||||
scorefile = "gzip > " scoredir "/" currentfile ".gz";
|
||||
printf "processing hyps for %s\n", currentfile \
|
||||
> "/dev/stderr" ;
|
||||
hypno = 0;
|
||||
next;
|
||||
}
|
||||
# parse ngram -ppl output to get OOV (including zeroprobs) count
|
||||
count_oovs && $6 == "OOVs" {
|
||||
num_oovs = $5;
|
||||
next;
|
||||
}
|
||||
count_oovs && $2 == "zeroprobs," {
|
||||
num_oovs += $1;
|
||||
print num_oovs | scorefile;
|
||||
next;
|
||||
}
|
||||
# process ngram -rescore output
|
||||
!count_oovs {
|
||||
if ($2 ~ /NaN/) {
|
||||
print "WARNING: LM score in nbest list " currentfile " is NaN" \
|
||||
> "/dev/stderr" ;
|
||||
$2 = -100000;
|
||||
}
|
||||
|
||||
if (bytelog) {
|
||||
$1 = $1 * bytelogscale;
|
||||
$2 = $2 * bytelogscale;
|
||||
}
|
||||
if (lm_only) {
|
||||
print $2 | scorefile;
|
||||
} else {
|
||||
print | scorefile;
|
||||
}
|
||||
}
|
||||
END {
|
||||
if (currentfile) {
|
||||
close(scorefile);
|
||||
}
|
||||
}
|
||||
' scoredir=$scoredir escape="$escape" bytelog=$bytelog lm_only=$lm_only count_oovs=$count_oovs
|
||||
|
||||
43
language_model/srilm-1.7.3/utils/src/rescore-minimize-wer
Executable file
43
language_model/srilm-1.7.3/utils/src/rescore-minimize-wer
Executable file
@@ -0,0 +1,43 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# rescore-minimize-wer --
|
||||
# minimize posterior expected WER in an nbest-list
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-minimize-wer,v 1.7 2013/03/09 07:13:01 stolcke Exp $
|
||||
#
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "usage: $0: score-dir [lmw [wtw [max-nbest]]]" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
scoredir="$1"
|
||||
lmweight="${2-8.0}"
|
||||
wtweight="${3-0.0}"
|
||||
maxnbest="${4-10}"
|
||||
|
||||
find $scoredir -follow -type f \( -name \*.score -o \
|
||||
-name \*.score.Z -o \
|
||||
-name \*.score.gz \) \
|
||||
-print | sort | \
|
||||
while read file
|
||||
do
|
||||
case $file in
|
||||
*.Z) cat="gzip -dcf"
|
||||
sentid=`basename $file .score.Z`
|
||||
;;
|
||||
*.gz) cat="gzip -dcf"
|
||||
sentid=`basename $file .score.gz`
|
||||
;;
|
||||
*) cat=cat
|
||||
sentid=`basename $file .score`
|
||||
;;
|
||||
esac
|
||||
${GAWK-gawk} -v sentid="$sentid" 'BEGIN { printf "%s ", sentid }'
|
||||
$cat $file | \
|
||||
sed -e 's,-pau-,,g' -e 's,\[[^]]*\],,g' | \
|
||||
nbest-lattice -wer -debug 1 -rescore - \
|
||||
-rescore-lmw $lmweight -rescore-wtw $wtweight \
|
||||
-max-rescore $maxnbest
|
||||
done
|
||||
|
||||
77
language_model/srilm-1.7.3/utils/src/rescore-nbest
Executable file
77
language_model/srilm-1.7.3/utils/src/rescore-nbest
Executable file
@@ -0,0 +1,77 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# rescore-nbest --
|
||||
# output LM scores for nbest lists
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-nbest,v 1.3 1996/03/28 19:12:01 stolcke Exp $
|
||||
#
|
||||
|
||||
if [ $# -lt 3 ]; then
|
||||
echo "usage: $0: nbest-file-list score-dir lm-options ..." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
filelist="$1"
|
||||
scoredir="$2"
|
||||
shift; shift
|
||||
|
||||
#
|
||||
# STRATEGY:
|
||||
# Concatenate hyps for all nbest list, record number of hyps for
|
||||
# each file in the output stream
|
||||
# Strip hyp ids, !SENT_START, !SENT_END
|
||||
# Feed to ngram -ppl (using lm-options)
|
||||
# Parse ngram output into lm scores and deposit into target files
|
||||
#
|
||||
|
||||
escape="***FILE:"
|
||||
|
||||
cat $filelist | ( \
|
||||
while read filename; do
|
||||
set -e
|
||||
numhyps=`wc -l < $filename`
|
||||
echo "$escape `basename $filename .trans`.score $numhyps"
|
||||
sed \
|
||||
-e 's/^ *([^ ]*) //' \
|
||||
-e 's/!SENT_START //' \
|
||||
-e 's/!SENT_END //' \
|
||||
$filename
|
||||
done
|
||||
) | \
|
||||
ngram -debug 1 -ppl - -escape "$escape " "$@" | \
|
||||
gawk '
|
||||
BEGIN {
|
||||
currentfile = "";
|
||||
scoredir = "";
|
||||
scorefile = "";
|
||||
numhyps = 0;
|
||||
M_LN10 = 2.30258509299404568402; # from <math.h>
|
||||
}
|
||||
$1 == escape {
|
||||
if (currentfile) {
|
||||
close(scorefile);
|
||||
}
|
||||
currentfile = $2;
|
||||
scorefile = scoredir "/" currentfile;
|
||||
numhyps = $3;
|
||||
printf "processing %d hyps for %s\n", numhyps, currentfile;
|
||||
hypno = 0;
|
||||
next;
|
||||
}
|
||||
/logprob=/ {
|
||||
logprob = $4;
|
||||
|
||||
hypno ++;
|
||||
|
||||
# rescale LM scores to natural logs
|
||||
printf "%g\n", logprob * M_LN10 > scorefile;
|
||||
|
||||
next;
|
||||
}
|
||||
END {
|
||||
if (currentfile) {
|
||||
close(scorefile);
|
||||
}
|
||||
}
|
||||
' scoredir=$scoredir escape="$escape"
|
||||
|
||||
134
language_model/srilm-1.7.3/utils/src/rescore-reweight
Executable file
134
language_model/srilm-1.7.3/utils/src/rescore-reweight
Executable file
@@ -0,0 +1,134 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# rescore-reweight
|
||||
# reweight nbest-list scores and select top hyps
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/rescore-reweight,v 1.20 2013/03/09 07:13:01 stolcke Exp $
|
||||
#
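#
# Illustrative note (a sketch of the score files this loop expects): each
# .score file holds one hypothesis per line in SRILM N-best format,
#
#	<acoustic-score> <lm-score> <num-words> word1 word2 ...
#
# and the reranking below picks the hypothesis maximizing
#
#	total = acoustic + lmw * lm + wtw * num-words
#
# printing it as "<sentid> word1 word2 ..." on stdout.
#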
|
||||
|
||||
multiwords=0
|
||||
multichar=_
|
||||
|
||||
while [ $# -gt 0 ]
|
||||
do
|
||||
case "$1" in
|
||||
-multiwords)
|
||||
multiwords=1
|
||||
;;
|
||||
-multi-char)
|
||||
multichar="$2"
|
||||
shift
|
||||
;;
|
||||
-*) echo "$0: unknown option $1" >&2
|
||||
exit 2 ;;
|
||||
*) break
|
||||
;;
|
||||
esac
|
||||
|
||||
shift
|
||||
done
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "usage: $0 [-multiwords] [-multi-char C] score-dir [lmw [wtw [scoredir weight ...] [max-nbest]]]" >&2
|
||||
echo " or $0 [-multiwords] [-multi-char C] file-list [lmw [wtw [scoredir weight ...] [max-nbest]]]" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
scoredir="$1"
|
||||
shift
|
||||
|
||||
lmweight="${1-8.0}"
|
||||
[ $# -gt 0 ] && shift
|
||||
wtweight="${1-0.0}"
|
||||
[ $# -gt 0 ] && shift
|
||||
|
||||
extra_scoredirs=
|
||||
extra_weights=
|
||||
while [ $# -gt 1 ]; do
|
||||
extra_scoredirs="$extra_scoredirs $1"
|
||||
extra_weights="$extra_weights $2"
|
||||
shift; shift
|
||||
done
|
||||
|
||||
maxnbest="${1-100000}"
|
||||
|
||||
# prevent "broken pipe" from $cat below when maxnbest truncates list
|
||||
trap '' 13
|
||||
|
||||
if [ -d $scoredir ]; then
|
||||
find $scoredir -follow -type f \( -name \*.score -o \
|
||||
-name \*.score.Z -o \
|
||||
-name \*.gz \) \
|
||||
-print | sort
|
||||
else
|
||||
cat $scoredir
|
||||
fi | \
|
||||
while read file
|
||||
do
|
||||
case $file in
|
||||
*.score.Z) cat="gzip -dcf"
|
||||
sentid=`basename $file .score.Z`
|
||||
;;
|
||||
*.score.gz) cat="gzip -dcf"
|
||||
sentid=`basename $file .score.gz`
|
||||
;;
|
||||
*.score) cat=cat
|
||||
sentid=`basename $file .score`
|
||||
;;
|
||||
*) # use nbest-lattice to convert Decipher nbest format
|
||||
cat="nbest-lattice -no-rescore -no-reorder -keep-noise -write-nbest - -nbest"
|
||||
sentid=`basename $file .gz`
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ -z "$extra_scoredirs" ]; then
|
||||
$cat $file
|
||||
else
|
||||
extra_scores=
|
||||
for dir in $extra_scoredirs
|
||||
do
|
||||
if [ -f $dir/$sentid.gz ]; then
|
||||
extra_scores="$extra_scores $dir/$sentid.gz"
|
||||
elif [ -f $dir/$sentid ]; then
|
||||
extra_scores="$extra_scores $dir/$sentid"
|
||||
else
|
||||
echo "$dir/$sentid" is missing >&2
|
||||
extra_scores="$extra_scores /dev/null"
|
||||
fi
|
||||
done
|
||||
|
||||
$cat $file | \
|
||||
combine-acoustic-scores \
|
||||
-v "weights=1 $extra_weights" \
|
||||
-v max_nbest=$maxnbest \
|
||||
- $extra_scores
|
||||
fi | \
|
||||
${GAWK-gawk} '
|
||||
BEGIN {
|
||||
hypnum = 0;
|
||||
}
|
||||
NF >= 3 {
|
||||
hypnum ++;
|
||||
if (hypnum > maxnbest) exit 0;
|
||||
|
||||
totalscore = $1 + lmweight * $2 + wtweight * $3;
|
||||
|
||||
if (!winner || totalscore > maxscore) {
|
||||
maxscore = totalscore;
|
||||
winner = $0;
|
||||
winrank = hypnum;
|
||||
besthyp = "";
|
||||
for (i = 4; i <= NF; i++) besthyp = besthyp " " $i;
|
||||
}
|
||||
}
|
||||
END {
|
||||
# resolve multiwords if requested
|
||||
if (multiwords) {
|
||||
gsub(multichar, " ", besthyp);
|
||||
}
|
||||
print sentid besthyp;
|
||||
printf "%s: best hyp is %d\n", sentid, winrank > "/dev/stderr";
|
||||
}
|
||||
' sentid="$sentid" lmweight="$lmweight" wtweight="$wtweight" maxnbest="$maxnbest" multiwords=$multiwords multichar="$multichar"
|
||||
done
|
||||
|
||||
85
language_model/srilm-1.7.3/utils/src/reverse-lm.gawk
Executable file
85
language_model/srilm-1.7.3/utils/src/reverse-lm.gawk
Executable file
@@ -0,0 +1,85 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# reverse-lm --
|
||||
# reverse N-grams in a backoff LM
|
||||
#
|
||||
# usage: reverse-lm lm-file > rev-lm-file
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-lm.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
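#
# Sketch of the reversal rule applied below (a restatement of the code,
# everything in log10): for a bigram LM,
#
#	p_rev(w1|w2) = p(w1) * p(w2|w1) / p(w2)
#
# which in log space is uniprob[w1] + prob - uniprob[w2]; <s> and </s> swap
# roles, and the result is piped through "ngram -renorm" to recompute the
# backoff weights.
#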
|
||||
|
||||
BEGIN {
|
||||
start_tag = "<s>";
|
||||
end_tag = "</s>";
|
||||
|
||||
renorm_command = "ngram -debug 1 -order 2 -lm - -renorm -write-lm -";
|
||||
}
|
||||
NF==0 {
|
||||
print | renorm_command;
|
||||
next;
|
||||
}
|
||||
/^ngram *[0-9][0-9]*=/ {
|
||||
order = substr($2,1,index($2,"=")-1);
|
||||
|
||||
if (order > 2) {
|
||||
print "can handle bigram LMs only" >> "/dev/stderr";
|
||||
exit(2);
|
||||
}
|
||||
print | renorm_command;
|
||||
next;
|
||||
}
|
||||
/^\\[0-9]-grams:/ {
|
||||
currorder=substr($0,2,1);
|
||||
print | renorm_command;
|
||||
next;
|
||||
}
|
||||
/^\\/ {
|
||||
print | renorm_command;
|
||||
next;
|
||||
}
|
||||
currorder == 1 {
|
||||
# unigrams are copied unchanged
|
||||
# store probs for later use
|
||||
|
||||
prob = $1;
|
||||
word = $2;
|
||||
if (word == start_tag) {
|
||||
; # get <s> unigram prob from </s>
|
||||
} else if (word == end_tag) {
|
||||
uniprob[start_tag] = uniprob[end_tag] = prob;
|
||||
} else {
|
||||
uniprob[word] = prob;
|
||||
}
|
||||
|
||||
# add dummy backoff weight
|
||||
$3 = "0";
|
||||
print | renorm_command;
|
||||
next;
|
||||
}
|
||||
|
||||
function map_tags(w) {
|
||||
if (w == start_tag) {
|
||||
return end_tag;
|
||||
} else if (w == end_tag) {
|
||||
return start_tag;
|
||||
} else {
|
||||
return w;
|
||||
}
|
||||
}
|
||||
|
||||
currorder == 2 {
|
||||
# bigrams are reversed and new probabilities are assigned
|
||||
prob = $1;
|
||||
w1 = map_tags($2);
|
||||
w2 = map_tags($3);
|
||||
|
||||
# p_rev(w1|w2) = p(w1) p(w2|w1) / p(w2)
|
||||
new_prob = uniprob[w1] + prob - uniprob[w2];
|
||||
|
||||
if (new_prob > 0) {
|
||||
print "warning: p(" w1 "|" w2 ") > 0" >> "/dev/stderr";
|
||||
}
|
||||
|
||||
print new_prob "\t" w2 " " w1 | renorm_command;
|
||||
next;
|
||||
}
|
||||
28
language_model/srilm-1.7.3/utils/src/reverse-ngram-counts.gawk
Executable file
28
language_model/srilm-1.7.3/utils/src/reverse-ngram-counts.gawk
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# reverse-ngram-counts --
|
||||
# Reverse the word order in N-gram count files
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-ngram-counts.gawk,v 1.2 2017/07/31 18:18:50 stolcke Exp $
|
||||
#
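# Illustrative example (the count is made up): the trigram count line
#
#	<s> how are 17
#
# becomes
#
#	are how </s> 17
#
# the words are mirrored, the count in the last field stays in place, and
# <s> / </s> are swapped so the reversed N-grams still carry valid
# sentence-boundary tags.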
|
||||
BEGIN {
|
||||
start_tag = "<s>";
|
||||
end_tag = "</s>";
|
||||
}
|
||||
{
|
||||
i = 1;
|
||||
j = NF - 1;
|
||||
while (i < j) {
|
||||
h = $i;
|
||||
$i = $j;
|
||||
$j = h;
|
||||
i ++; j--;
|
||||
}
|
||||
|
||||
# swap <s> and </s> tags
|
||||
for (i = 1; i < NF; i ++) {
|
||||
if ($i == end_tag) $i = start_tag;
|
||||
else if ($i == start_tag) $i = end_tag;
|
||||
}
|
||||
print;
|
||||
}
|
||||
32
language_model/srilm-1.7.3/utils/src/reverse-text.gawk
Executable file
32
language_model/srilm-1.7.3/utils/src/reverse-text.gawk
Executable file
@@ -0,0 +1,32 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# reverse-text --
|
||||
# Reverse the word order in a text file
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/reverse-text.gawk,v 1.1 2003/01/01 18:35:23 stolcke Exp $
|
||||
#
|
||||
BEGIN {
|
||||
start_tag = "<s>";
|
||||
end_tag = "</s>";
|
||||
}
|
||||
{
|
||||
if ($1 == start_tag) {
|
||||
i = 2;
|
||||
} else {
|
||||
i = 1;
|
||||
}
|
||||
|
||||
if ($NF == end_tag) {
|
||||
j = NF - 1;
|
||||
} else {
|
||||
j = NF;
|
||||
}
|
||||
|
||||
while (i < j) {
|
||||
h = $i;
|
||||
$i = $j;
|
||||
$j = h;
|
||||
i ++; j--;
|
||||
}
|
||||
print;
|
||||
}
|
||||
176
language_model/srilm-1.7.3/utils/src/rexport.gnumake
Executable file
176
language_model/srilm-1.7.3/utils/src/rexport.gnumake
Executable file
@@ -0,0 +1,176 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# rexport --
|
||||
# retrying export with customs, via gnumake
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/rexport.gnumake,v 1.2 2011/07/21 19:48:19 stolcke Exp $
|
||||
#
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [-m] [-J numjobs] [-delay D] [-check-exec] [-f] [-debug] [-same] [-exclusive] [-exit-on-error] [-uselocal] [-attr value] ... command [args ...]" >&2
|
||||
}
|
||||
|
||||
# allow as many file descriptors as possible for pmake
|
||||
# (this command may fail in old versions of sh -- we ignore that)
|
||||
ulimit -n `ulimit -H -n 2>/dev/null` >/dev/null 2>&1
|
||||
|
||||
set -e
|
||||
|
||||
jobs=1
|
||||
makemode=0
|
||||
delay=
|
||||
check_exec=0
|
||||
exit_on_error=0
|
||||
|
||||
#
|
||||
# parse options
|
||||
#
|
||||
attributes=
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
-m) makemode=1
|
||||
shift ;;
|
||||
-same) attributes="$attributes SAME"
|
||||
shift ;;
|
||||
-exclusive)
|
||||
attributes="$attributes EXCLUSIVE"
|
||||
shift ;;
|
||||
-uselocal)
|
||||
attributes="$attributes USELOCAL"
|
||||
shift ;;
|
||||
-attr) attributes="$attributes $2"
|
||||
shift; shift;;
|
||||
-debug) debug=1
|
||||
shift ;;
|
||||
-f) readfiles=1;
|
||||
shift ;;
|
||||
-J) jobs="$2"
|
||||
shift; shift ;;
|
||||
-delay) delay="$2"
|
||||
shift; shift ;;
|
||||
-check-exec)
|
||||
check_exec=1
|
||||
shift ;;
|
||||
-exit-on-error)
|
||||
exit_on_error=1
|
||||
shift ;;
|
||||
-*) usage
|
||||
exit 2 ;;
|
||||
*)
|
||||
break ;;
|
||||
esac
|
||||
done
|
||||
|
||||
#
|
||||
# parse command
|
||||
#
|
||||
|
||||
# find tmp file that doesn't exist yet
|
||||
for suffix in a b c d e f g h i j k l m n o p q r s t u v x y z
|
||||
do
|
||||
mkfile=/tmp/export$$$suffix
|
||||
if [ ! -f $mkfile ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
trap "rm -f $mkfile; exit 1" 1 2 15
|
||||
|
||||
#
|
||||
# create makefile
|
||||
#
|
||||
if [ "$#" -eq 0 -o "$readfiles" ]; then
|
||||
# read commands from files or stdin
|
||||
cat "$@"
|
||||
else
|
||||
# use what's on the command line
|
||||
echo "$@"
|
||||
fi | \
|
||||
gawk '
|
||||
BEGIN {
|
||||
ld_lib_path_var = "LD_LIBRARY_PATH";
|
||||
}
|
||||
NR == 1 {
|
||||
# always use /bin/sh for portability across platforms
|
||||
print "SHELL=/bin/sh"
|
||||
print ".cleanup: ; @/bin/rm -f " mkfile
|
||||
|
||||
jobnum = 0;
|
||||
}
|
||||
NF > 0 {
|
||||
jobnum ++;
|
||||
|
||||
job = ".job" jobnum;
|
||||
alljobs = alljobs job " ";
|
||||
|
||||
# make sure shell variable expansion is preserved
|
||||
gsub("\\$", "$$");
|
||||
|
||||
delay = delay + 0;
|
||||
|
||||
if (check_exec) {
|
||||
exec_file = "";
|
||||
for (i = 1; i <= NF; i ++) {
|
||||
if ( $i ~ "^/") {
|
||||
exec_file = $i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (exec_file) {
|
||||
sub("[;&|].*", "", exec_file);
|
||||
$0 = "while [ ! -x " exec_file " ]; do sleep 5; done; " $0
|
||||
}
|
||||
}
|
||||
|
||||
if (ld_lib_path_var in ENVIRON) {
|
||||
$0 = ld_lib_path_var "=" ENVIRON[ld_lib_path_var] "; export " ld_lib_path_var "; " $0
|
||||
}
|
||||
|
||||
if (njobs > 1) {
|
||||
if (delay > 0 && jobnum > 1) {
|
||||
prev_delay_target = delay_target;
|
||||
delay_target = "delay" jobnum;
|
||||
print delay_target ": " prev_delay_target \
|
||||
"; @sleep " delay;
|
||||
} else {
|
||||
delay_target = "";
|
||||
}
|
||||
|
||||
print job ": " delay_target "; " $0;
|
||||
} else {
|
||||
print job ": ; @" $0;
|
||||
}
|
||||
if (makemode) {
|
||||
print "\t@touch " job;
|
||||
}
|
||||
}
|
||||
END {
|
||||
print "all: " alljobs;
|
||||
|
||||
print alljobs ": .cleanup";
|
||||
|
||||
if (jobnum == 0) {
|
||||
print "warning: empty command list" > "/dev/stderr";
|
||||
}
|
||||
}
|
||||
' makemode=$makemode attributes="$attributes" mkfile="$mkfile" \
|
||||
njobs="$jobs" delay="$delay" check_exec=$check_exec \
|
||||
exit_on_error=$exit_on_error > $mkfile
|
||||
|
||||
if [ "$debug" ]; then
|
||||
cat $mkfile
|
||||
rm -f $mkfile
|
||||
exit
|
||||
fi
|
||||
|
||||
# avoid illegal values when make is invoked from other makes
|
||||
MAKEFLAGS=
|
||||
MFLAGS=
|
||||
export MAKEFLAGS MFLAGS
|
||||
|
||||
if [ $exit_on_error = 0 ]; then
|
||||
ignoreflag=-k
|
||||
fi
|
||||
|
||||
exec make -j $jobs $ignoreflag -f $mkfile all
|
||||
|
||||
35
language_model/srilm-1.7.3/utils/src/rover-control-tying.gawk
Executable file
35
language_model/srilm-1.7.3/utils/src/rover-control-tying.gawk
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# rover-control-tying --
|
||||
# extract tying information from rover-control file for use with
|
||||
# compute-best-rover-mix tying=...
|
||||
#
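#
# Illustrative example (system names and weights are made up): given the
# rover-control lines
#
#	sys1/nbest 8 0 1
#	sys2/nbest 8 0 =
#	sys3/nbest 8 0 0.5
#
# systems 1 and 2 share a weight (the "=" marker), so the script prints the
# tying string "1 1 2", suitable for compute-best-rover-mix tying=... .
#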
|
||||
|
||||
BEGIN {
|
||||
bin = 0;
|
||||
}
|
||||
|
||||
/^##/ || /^[ ]*$/ {
|
||||
# skip comment or empty line
|
||||
next;
|
||||
}
|
||||
|
||||
$3 == "+" {
|
||||
next;
|
||||
}
|
||||
|
||||
{
|
||||
if ($4 == "") $4 = 1;
|
||||
|
||||
if ($4 == "=") {
|
||||
output = output " " bin;
|
||||
} else {
|
||||
output = output " " ++bin;
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
sub("^ ", "", output);
|
||||
print output;
|
||||
}
|
||||
|
||||
65
language_model/srilm-1.7.3/utils/src/rover-control-weights.gawk
Executable file
65
language_model/srilm-1.7.3/utils/src/rover-control-weights.gawk
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# rover-control-weights --
|
||||
# retrieve or change weights in rover-control file
|
||||
#
|
||||
# usage:
|
||||
# retrieving
|
||||
# rover-control-weights rover-control
|
||||
# changing:
|
||||
# rover-control-weights weights="..." rover-control > new-rover-control
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/rover-control-weights.gawk,v 1.3 2017/08/16 06:34:16 stolcke Exp $
|
||||
#
|
||||
|
||||
NR == 1 {
|
||||
if (weights) {
|
||||
nweights = split(weights, w);
|
||||
}
|
||||
output_weights = "";
|
||||
}
|
||||
|
||||
/^##/ || /^[ ]*$/ {
|
||||
# pass through comment or empty line
|
||||
print;
|
||||
next;
|
||||
}
|
||||
|
||||
$3 == "+" {
|
||||
if (weights) {
|
||||
print;
|
||||
}
|
||||
next;
|
||||
}
|
||||
{
|
||||
# dir lmw wtw weight max_nbest scale
|
||||
if (weights) {
|
||||
# fill in missing parameter values
|
||||
if (NF < 2) $2 = 8;
|
||||
if (NF < 3) $3 = 0;
|
||||
|
||||
if (++ sysno <= nweights) {
|
||||
if ($4 == "=" && w[sysno] == w[sysno-1]) {
|
||||
# preserve weight tying if new weights are compatible
|
||||
;
|
||||
} else {
|
||||
$4 = w[sysno];
|
||||
}
|
||||
} else {
|
||||
$4 = 1;
|
||||
}
|
||||
print;
|
||||
} else {
|
||||
if (NF < 4) $4 = 1;
|
||||
output_weights = output_weights " " $4;
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
if (!weights) {
|
||||
sub("^ ", "", output_weights);
|
||||
print output_weights;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
267
language_model/srilm-1.7.3/utils/src/search-rover-combo
Executable file
267
language_model/srilm-1.7.3/utils/src/search-rover-combo
Executable file
@@ -0,0 +1,267 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# search-rover-combo --
|
||||
# search for best rover combination from a list of systems
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/search-rover-combo,v 1.14 2016-12-10 18:20:33 stolcke Exp $
|
||||
#
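#
# Illustrative invocation (paths are made up):
#
#	search-rover-combo -scorer ./score-hyps -refs eval.refs \
#		-datadir SEARCH-DATA -J 4 control-file-list > best-rover.control
#
# where control-file-list names one nbest-rover control file per line; the
# script scores each system alone, then greedily keeps adding the unused
# system that most lowers the error reported by the scorer, and finally
# prints the winning combined control file on stdout.
#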
|
||||
|
||||
|
||||
scriptdir=`dirname $0`
|
||||
score_script=$scriptdir/score-hyps
|
||||
datadir=SEARCH-DATA
|
||||
weights="1"
|
||||
smooth_weight=
|
||||
sentids=-
|
||||
njobs=1
|
||||
refs=
|
||||
|
||||
# collect options
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
-rover) shift
|
||||
run_rover=1
|
||||
break ;;
|
||||
-rover-optimize) shift
|
||||
run_rover_optimize=1
|
||||
break ;;
|
||||
-scorer) score_script="$2";
|
||||
shift; shift ;;
|
||||
-weights) weights="$2";
|
||||
shift; shift ;;
|
||||
-smooth-weight)
|
||||
smooth_weight="$2";
|
||||
shift; shift ;;
|
||||
-smooth-control)
|
||||
smooth_control="$2";
|
||||
shift; shift ;;
|
||||
-datadir) datadir="$2";
|
||||
shift; shift ;;
|
||||
-sentids) sentids="$2";
|
||||
shift; shift ;;
|
||||
-refs) refs="$2"
|
||||
shift; shift ;;
|
||||
-J) njobs=$2
|
||||
shift; shift ;;
|
||||
-*) echo "usage: $0 [-scorer SCRIPT] [-weights=\"W1 W2 ...\" | -refs REFS] [-smooth-weight S] [-datadir DIR] [-sentids LIST] LIST-OF-CONTROL-FILES" >&2
|
||||
exit 2 ;;
|
||||
*) break ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# see if this is a recursive evaluation to run a single nbest-rover
|
||||
if [ -n "$run_rover" ]; then
|
||||
# sentids control-file hyps-out
|
||||
nbest-rover $1 $2 > $3
|
||||
exit
|
||||
elif [ -n "$run_rover_optimize" ]; then
|
||||
# sentids control-file hyps-out refs
|
||||
nbest-rover $1 $2 /dev/null > $3-0 2>&1 \
|
||||
-refs $4 -write-ref-posteriors $3.ref-posteriors
|
||||
rm $3-0
|
||||
tying=`rover-control-tying $2`
|
||||
compute-best-rover-mix tying="$tying" $3.ref-posteriors > $3.optimize 2>&1
|
||||
|
||||
weights=`${GAWK-gawk} '/best lambda/ { sub(".*[(]", "", $0); sub("[)]", "", $0); print }' $3.optimize `
|
||||
|
||||
rover-control-weights weights="$weights" $2 > $2.optimized1
|
||||
|
||||
if [ -n "$smooth_weight" -a -n "$smooth_control" ]; then
|
||||
combine-rover-controls keeppaths=1 lambda=$smooth_weight $smooth_control $2.optimized1 > $2.optimized
|
||||
else
|
||||
mv $2.optimized1 $2.optimized
|
||||
fi
|
||||
|
||||
nbest-rover $1 $2.optimized > $3
|
||||
exit
|
||||
fi
|
||||
|
||||
rexport=${REXPORT-rexport.gnumake -exit-on-error -J $njobs -f}
|
||||
|
||||
input_list=${1-SYSTEM-LIST}
|
||||
# backward compatibility for 2nd argument
|
||||
score_script=${2-$score_script}
|
||||
# backward compatibility for 3rd argument
|
||||
datadir=${3-$datadir}
|
||||
|
||||
set -e
|
||||
|
||||
mkdir -p $datadir
|
||||
|
||||
|
||||
#
|
||||
# Step 1: compute errors for individual systems
|
||||
#
|
||||
|
||||
system_errors=$datadir/system-errors
|
||||
cmdlist=$datadir/score.rexports
|
||||
|
||||
tmpctrl=$datadir/tmp.control
|
||||
tmphyps=$datadir/tmp.hyps
|
||||
tmpscore=$datadir/tmp.score
|
||||
|
||||
sort $input_list > $datadir/sorted_inputs
|
||||
|
||||
iter=0
|
||||
iterdir=$datadir/$iter
|
||||
mkdir -p $iterdir
|
||||
|
||||
system_errors=$iterdir/system_errors
|
||||
|
||||
if [ ! -s $system_errors ]; then
|
||||
count=1
|
||||
> $cmdlist
|
||||
|
||||
cat $datadir/sorted_inputs | \
|
||||
while read roverctrl
|
||||
do
|
||||
# rewrite rover control file to adjust directory paths
|
||||
combine-rover-controls $roverctrl > $tmpctrl.$count
|
||||
|
||||
echo "$0 -rover $sentids $tmpctrl.$count $tmphyps.$count; \
|
||||
echo $roverctrl \`$score_script $tmphyps.$count\` > $tmpscore.$count" >> $cmdlist
|
||||
|
||||
count=`expr $count + 1`
|
||||
done
|
||||
|
||||
# run the scoring jobs
|
||||
if [ $njobs -lt 2 ]; then
|
||||
sh -ex $cmdlist >$cmdlist.log 2>&1
|
||||
else
|
||||
$rexport $cmdlist >$cmdlist.log 2>&1
|
||||
fi
|
||||
sort +0 -1 $tmpscore.* > $system_errors
|
||||
|
||||
rm -f $tmpctrl.* $tmphyps.* $tmpscore.*
|
||||
fi # system_errors exists
|
||||
|
||||
best_system=`sort +1n -2 $system_errors | ${GAWK-gawk} '{ print $1; exit }' `
|
||||
best_error=`sort +1n -2 $system_errors | ${GAWK-gawk} '{ print $2; exit }' `
|
||||
|
||||
echo "FIRST SYSTEM" >&2
|
||||
echo $best_system >&2
|
||||
echo "ERROR $best_error" >&2
|
||||
|
||||
echo "$best_system 1" > $iterdir/combo
|
||||
join -v 1 $datadir/sorted_inputs $iterdir/combo > $iterdir/unused
|
||||
cat $best_system > $iterdir/rover.control
|
||||
|
||||
tryall=yes
|
||||
|
||||
# if weight estimation is used we always add the new system at a fixed lower weight
|
||||
# than the sum of prior systems
|
||||
if [ -n "$refs" ]; then
|
||||
weights=0.5
|
||||
fi
|
||||
|
||||
while [ -s $iterdir/unused ]
|
||||
do
|
||||
newiter=`expr $iter + 1`
|
||||
newiterdir=$datadir/$newiter
|
||||
mkdir -p $newiterdir
|
||||
|
||||
echo "ITER $newiter" >&2
|
||||
|
||||
system_errors=$newiterdir/system_errors
|
||||
|
||||
if [ ! -s $system_errors ]; then
|
||||
|
||||
for weight in $weights
|
||||
do
|
||||
count=1
|
||||
> $cmdlist
|
||||
|
||||
cat $iterdir/unused | \
|
||||
while read roverctrl
|
||||
do
|
||||
combine-rover-controls keeppaths=1 lambda="1 $weight" $iterdir/rover.control $roverctrl > $tmpctrl.$count
|
||||
|
||||
if [ -n "$refs" ]; then
|
||||
# evaluate rover control file with weight optimization
|
||||
if [ -n "$smooth_weight" ]; then
|
||||
smooth="-smooth-weight $smooth_weight -smooth-control $iterdir/rover.control"
|
||||
fi
|
||||
echo "$0 $smooth -rover-optimize $sentids $tmpctrl.$count $tmphyps.$count $refs; \
|
||||
echo $roverctrl $weight \`$score_script $tmphyps.$count\` $tmpctrl.$count.optimized > $tmpscore.$count" >> $cmdlist
|
||||
else
|
||||
# evaluate rover control file without weight optimization
|
||||
echo "$0 -rover $sentids $tmpctrl.$count $tmphyps.$count; \
|
||||
echo $roverctrl $weight \`$score_script $tmphyps.$count\` $tmpctrl.$count > $tmpscore.$count" >> $cmdlist
|
||||
fi
|
||||
|
||||
count=`expr $count + 1`
|
||||
done
|
||||
|
||||
# run the scoring jobs
|
||||
if [ $njobs -lt 2 ]; then
|
||||
sh -ex $cmdlist >$cmdlist.log 2>&1
|
||||
else
|
||||
$rexport $cmdlist >$cmdlist.log 2>&1
|
||||
fi
|
||||
sort +0 -1 $tmpscore.* > $system_errors
|
||||
|
||||
${GAWK-gawk} -v old_error=$best_error '$3 < old_error' $system_errors > $system_errors.improved
|
||||
|
||||
if [ -s $system_errors.improved ]; then
|
||||
# we found at least one improvement; stop trying weights
|
||||
break;
|
||||
fi
|
||||
done
|
||||
else
|
||||
# restart search at this iteration
|
||||
${GAWK-gawk} -v old_error=$best_error '$3 < old_error' $system_errors > $system_errors.improved
|
||||
fi
|
||||
|
||||
if [ -s $system_errors.improved ]; then
|
||||
best_system=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $1; exit }' `
|
||||
best_weight=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $2; exit }' `
|
||||
best_error=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $3; exit }' `
|
||||
best_control=`sort +2n -3 $system_errors.improved | ${GAWK-gawk} '{ print $4; exit }' `
|
||||
|
||||
echo "NEXT SYSTEM" >&2
|
||||
echo "$best_system $best_weight" >&2
|
||||
echo "ERROR $best_error" >&2
|
||||
|
||||
if [ ! -s $newiterdir/rover.control ]; then
|
||||
cat $best_control > $newiterdir/rover.control
|
||||
fi
|
||||
|
||||
{ cat $iterdir/combo; echo "$best_system $best_weight"; } | sort +0 -1 > $newiterdir/combo
|
||||
${GAWK-gawk} '{ print $1 }' $system_errors.improved | \
|
||||
join -v 1 - $newiterdir/combo > $newiterdir/unused
|
||||
|
||||
tryall=yes
|
||||
else
|
||||
cat $iterdir/combo > $newiterdir/combo
|
||||
cat $iterdir/rover.control > $newiterdir/rover.control
|
||||
fi
|
||||
|
||||
rm -f $tmpctrl.* $tmphyps.* $tmpscore.*
|
||||
|
||||
if [ ! -s $newiterdir/unused -a "$tryall" ]; then
|
||||
|
||||
# no improvement -- add all previously discarded systems back into the running
|
||||
echo "EXPANDING SEARCH" >&2
|
||||
|
||||
if [ ! -f $newiterdir/combo ]; then
|
||||
# try extending the same combo again in next iteration
|
||||
cat $iterdir/combo > $newiterdir/combo
|
||||
cat $iterdir/rover.control > $newiterdir/rover.control
|
||||
fi
|
||||
|
||||
join -v 1 $datadir/sorted_inputs $newiterdir/combo > $newiterdir/unused
|
||||
|
||||
# do this only once until we can add a new system
|
||||
tryall=
|
||||
fi
|
||||
|
||||
iter=$newiter
|
||||
iterdir=$newiterdir
|
||||
done
|
||||
|
||||
echo "BEST COMBO" >&2
|
||||
cat $iterdir/combo >&2
|
||||
echo "ERROR $best_error" >&2
|
||||
|
||||
cat $iterdir/rover.control
|
||||
|
||||
551
language_model/srilm-1.7.3/utils/src/select-vocab.pl
Executable file
@@ -0,0 +1,551 @@
|
||||
#!/usr/bin/perl
|
||||
#
|
||||
# Usage: select-vocab [-quiet] -heldout file f1 f2 ... fn
|
||||
#
|
||||
# Selects a vocabulary from the union of the vocabularies of f1
|
||||
# through fn that maximizes the likelihood of the heldout file. f1
|
||||
# through fn can either be text files, count files or ARPA-style
|
||||
# back-off language models. Furthermore, if they are text files,
|
||||
# each line in them can optionally be prefixed by a sentence id, which
|
||||
# will be stripped if the file has the .sentid extension.
|
||||
#
|
||||
# Note: This implementation corrects an error in the paper [1]. The
|
||||
# EM procedure specification in [1] describes corpus level interpolation.
|
||||
# But we use word-level interpolation.
|
||||
#
|
||||
# Authors: Anand Venkataraman and Wen Wang
|
||||
# STAR Lab, SRI International, Menlo Park, CA 94025, USA.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/select-vocab.pl,v 1.7 2013/04/05 16:50:56 stolcke Exp $
|
||||
#
|
||||
|
||||
# Globals
|
||||
my $Quiet = 0; # Quiet or Verbose?
|
||||
my $Gzip = 0; # Do we have Gzip?
|
||||
|
||||
MAIN: {
|
||||
my $heldOut = ""; # Filename of the heldout corpus
|
||||
my $maxIter = 500; # Perform a maximum of this many EM iters
|
||||
my $precision = 1e-5; # Stop EM iterations when log likelihood changes less than this much
|
||||
my $scale = 1e6; # Scale final output counts by this much
|
||||
|
||||
while ($arg = shift(@ARGV)) {
|
||||
if ($arg =~ /^-h(elp)?$/) {
|
||||
usage();
|
||||
} elsif ($arg =~ /^-held(-)?(out)?$/) {
|
||||
$heldOut = shift(@ARGV);
|
||||
} elsif ($arg =~ /^-scale(-)?(counts)?$/) {
|
||||
$scale = shift(@ARGV);
|
||||
} elsif ($arg =~ /^-q(uiet)?$/) {
|
||||
$Quiet = 1;
|
||||
} elsif ($arg =~ /^-/) {
|
||||
print STDERR "Unknown option: $arg\n";
|
||||
usage();
|
||||
} else {
|
||||
unshift(@ARGV, $arg);
|
||||
last;
|
||||
}
|
||||
}
|
||||
die "$0: I need a held out corpus (-heldout) to maximize likelihood.\n" if ($heldOut eq "");
|
||||
die "$0: I need at least two corpora to combine vocabulary counts.\n" if ($#ARGV < 1);
|
||||
|
||||
# Determine whether gzip exists in the path
|
||||
#
|
||||
if (system("sh -c 'gzip -help' >/dev/null 2>&1") == 0) {
|
||||
message("I found gzip in your path. So I'll support compressed input.\n");
|
||||
$Gzip=1;
|
||||
} else {
|
||||
message("I didn't find gzip in your path. So I won't support compressed input.\n");
|
||||
$Gzip=0;
|
||||
}
|
||||
|
||||
# Make held-out counts and calculate total number of tokens.
|
||||
#
|
||||
my $heldOut_counts_ref = make_raw_counts($heldOut);
|
||||
my $numWords = 0;
|
||||
foreach my $word (keys %{$heldOut_counts_ref}) {
|
||||
$numWords += $heldOut_counts_ref->{$word};
|
||||
}
|
||||
die "$0: The held-out corpus must not be empty.\n" if ($numWords == 0);
|
||||
|
||||
|
||||
# The grand vocab is the union of all words observed in any input, including the held-out set.
|
||||
#
|
||||
my $vocab = make_full_vocab($heldOut, @ARGV);
|
||||
|
||||
# Create log distributions for each of the (n > 1) corpora. The counts
|
||||
# will all use a common vocabulary that is the union of the individual
|
||||
# vocabularies. Use Witten-Bell discounting to handle zero-frequency
|
||||
# items in the normalization process.
|
||||
#
|
||||
for (my $n = 0; $n <= $#ARGV; $n++) {
|
||||
$lambda[$n] = 1/($#ARGV+1);
|
||||
$logprobs_refs[$n] = estimate_logprobs($ARGV[$n], $vocab);
|
||||
}
|
||||
message("Iter 0: lambdas = (@lambda)\n");
|
||||
|
||||
# Now perform EM. Iterate to increase the likelihood of the heldout set.
|
||||
# Procedure halts when the likelihood changes by less than $precision
|
||||
# after an iteration. See Eqns. (3)-(6) of Venkataraman & Wang, 2003.
|
||||
#
|
||||
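# A sketch of the word-level update implemented below (our variant, not a
# verbatim restatement of [1]):
#   E-step:  post_n(w) = lambda_n * p_n(w) / sum_m lambda_m * p_m(w)
#   M-step:  lambda_n' = (1/N) * sum_w c_H(w) * post_n(w)
# where c_H(w) are the held-out counts and N is the total held-out token count.
#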
$done = 0;
|
||||
$iter = 0;
|
||||
while (!$done && $iter < $maxIter) {
|
||||
$done = 1;
|
||||
$iter++;
|
||||
|
||||
my $loglike = 0;
|
||||
@post_totals = ();
|
||||
|
||||
# Calculate log lambdas.
|
||||
#
|
||||
for (my $n = 0; $n <= $#ARGV; $n++) {
|
||||
$log_lambda[$n] = log($lambda[$n]);
|
||||
}
|
||||
|
||||
# Estimate lambdas per word and average over all words.
|
||||
#
|
||||
foreach my $word (keys %{$heldOut_counts_ref}) {
|
||||
undef $log_numer_sum;
|
||||
|
||||
for (my $n = 0; $n <= $#ARGV; $n++) {
|
||||
$log_numer[$n] = $log_lambda[$n] + $logprobs_refs[$n]->{$word};
|
||||
$log_numer_sum = logsum($log_numer_sum, $log_numer[$n]);
|
||||
}
|
||||
$loglike += $log_numer_sum * $heldOut_counts_ref->{$word};
|
||||
|
||||
for (my $n = 0; $n <= $#ARGV; $n++) {
|
||||
$post_totals[$n] += exp($log_numer[$n] - $log_numer_sum) * $heldOut_counts_ref->{$word};
|
||||
}
|
||||
}
|
||||
|
||||
for (my $n = 0; $n <= $#ARGV; $n++) {
|
||||
$lambda_prime[$n] = $post_totals[$n]/$numWords;
|
||||
$delta[$n] = abs($lambda_prime[$n] - $lambda[$n]);
|
||||
$done = 0 if ($delta[$n] > $precision);
|
||||
}
|
||||
|
||||
@lambda = @lambda_prime;
|
||||
|
||||
next if $Quiet;
|
||||
for (my $n = 0; $n <= $#lambda_prime; $n++) {
|
||||
$lambda_trunc[$n] = sprintf("%0.6f", $lambda[$n]);
|
||||
}
|
||||
my $ppl_trunc = sprintf("%.4f", exp(-$loglike/$numWords));
|
||||
my $loglike_trunc = sprintf("%.4f", $loglike);
|
||||
message("Iter $iter: lambdas = (@lambda_trunc) log P(held-out) = $loglike_trunc PPL = $ppl_trunc\n");
|
||||
}
|
||||
|
||||
# Compute the combined counts.
|
||||
#
|
||||
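# Each combined "count" is the mixture probability p(w) = sum_n lambda_n * p_n(w);
# it is scaled by $scale below so that the output looks like a counts file.
#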
message("Combining counts.\n");
|
||||
undef %counts;
|
||||
foreach my $word (keys %{$vocab}) {
|
||||
for (my $n = 0; $n <= $#ARGV; $n++) {
|
||||
$counts{$word} += $lambda[$n] * exp($logprobs_refs[$n]->{$word});
|
||||
}
|
||||
}
|
||||
|
||||
# Print out the final vocab with the combined counts scaled by $scale.
|
||||
#
|
||||
foreach my $word (keys %counts) {
|
||||
my $score = $counts{$word} * $scale;
|
||||
print "$word\t $score\n";
|
||||
}
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
|
||||
# Return a ref to a hash of normalized counts. Use the given vocabulary
|
||||
# and Witten-Bell (1991) smoothing to ensure non-zero probabilities.
|
||||
#
|
||||
sub estimate_logprobs {
|
||||
my($f, $voc_ref) = @_;
|
||||
|
||||
message("Estimating logprobs for $f. ");
|
||||
|
||||
my $counts_ref = make_raw_counts($f);
|
||||
|
||||
my $sumcounts = 0;
|
||||
foreach my $word (keys %{$counts_ref}) {
|
||||
$sumcounts += $counts_ref->{$word};
|
||||
}
|
||||
|
||||
# Compute the number of "novel" words, i.e., words in the vocab but
|
||||
# not in counts.
|
||||
#
|
||||
my $vocabsize = scalar keys %{$voc_ref};
|
||||
my $nwords = scalar keys %{$counts_ref};
|
||||
my $num_novel = $vocabsize - $nwords;
|
||||
message("It has all but $num_novel vocabulary words.\n");
|
||||
|
||||
# If there are no novel words, just normalize and return;
|
||||
#
|
||||
if (!$num_novel) {
|
||||
foreach my $word (keys %{$counts_ref}) {
|
||||
$counts_ref->{$word} = log($counts_ref->{$word}) - log($sumcounts);
|
||||
}
|
||||
return $counts_ref;
|
||||
}
|
||||
|
||||
# Create keys for novel words.
|
||||
#
|
||||
foreach my $word (keys %{$voc_ref}) {
|
||||
$counts_ref->{$word} += 0;
|
||||
}
|
||||
|
||||
# If the sum of the counts is less than one, we probably got them from a
|
||||
# language model that already smoothed the unigram counts, so we use the leftover
|
||||
# mass for novel words. Otherwise, if the sum is equal to 1, we rescale the
|
||||
# probabilities by 0.9 (until a better way can be found), and use the remaining
|
||||
# mass to distribute. If the counts are > 1, then we perform smoothing ourselves.
|
||||
#
|
||||
if ($sumcounts < 1) {
|
||||
my $novel_mass = 1-$sumcounts;
|
||||
message("\tSum of counts in $f is only $sumcounts\n");
|
||||
message("\tWill distribute probabilty mass of $novel_mass over novel words\n");
|
||||
my $novel_logprob = log(1-$sumcounts) - log($num_novel);
|
||||
foreach my $word (keys %{$counts_ref}) {
|
||||
if ($counts_ref->{$word}) {
|
||||
$counts_ref->{$word} = log($counts_ref->{$word});
|
||||
} else {
|
||||
$counts_ref->{$word} = $novel_logprob;
|
||||
}
|
||||
}
|
||||
return $counts_ref;
|
||||
}
|
||||
|
||||
if ($sumcounts == 1) {
|
||||
message("\tSum of counts in $f is exactly 1\n");
|
||||
message("\tWill scale them by 0.9 and use 0.1 for novel words.\n");
|
||||
my $novel_logprob = log(0.1/$num_novel);
|
||||
foreach my $word (keys %{$counts_ref}) {
|
||||
if ($counts_ref->{$word}) {
|
||||
$counts_ref->{$word} = log($counts_ref->{$word} * 0.9);
|
||||
} else {
|
||||
$counts_ref->{$word} = $novel_logprob;
|
||||
}
|
||||
}
|
||||
return $counts_ref;
|
||||
}
|
||||
|
||||
# Normalize and smooth. Note that in calculating the probability of novel words,
|
||||
# the Witten-Bell estimate for the novel event is $nwords/($sum_counts+$nwords).
|
||||
# This mass is shared equally by each of the novel words and hence $num_novel in
|
||||
# the denominator.
|
||||
#
|
||||
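# For example (hypothetical numbers): with $sumcounts = 10 tokens over $nwords = 3
# observed word types and $num_novel = 2 novel words, each observed word w gets
# log(c(w)/13) and each novel word gets log((3/13)/2) = log(3/26).
#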
foreach my $word (keys %{$counts_ref}) {
|
||||
if ($counts_ref->{$word}) {
|
||||
$counts_ref->{$word} = log($counts_ref->{$word}/($sumcounts + $nwords));
|
||||
} else {
|
||||
$counts_ref->{$word} = log($nwords) - log($sumcounts + $nwords) - log($num_novel);
|
||||
}
|
||||
}
|
||||
|
||||
return $counts_ref;
|
||||
}
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# The following subroutines construct the vocabulary from various kinds
|
||||
# of input files.
|
||||
#
|
||||
sub make_full_vocab {
|
||||
my @files = @_;
|
||||
my %voc;
|
||||
|
||||
foreach my $f (@files) {
|
||||
$ftype = getftype($f);
|
||||
if ($ftype eq "text") {
|
||||
message("Adding words from text file $f into vocabulary.\n");
|
||||
add_vocab_from_text(\%voc, $f);
|
||||
} elsif ($ftype eq "sentid") {
|
||||
message("Adding words from sentID file $f into vocabulary.\n");
|
||||
add_vocab_from_sentid(\%voc, $f);
|
||||
} elsif ($ftype eq "counts") {
|
||||
message("Adding words from counts file $f into vocabulary.\n");
|
||||
add_vocab_from_counts(\%voc, $f);
|
||||
} elsif ($ftype eq "arpa-lm") {
|
||||
message("Adding words from ARPA-style LM file $f into vocabulary.\n");
|
||||
add_vocab_from_lm(\%voc, $f);
|
||||
} else {
|
||||
die "I don't know the file type for $f. Giving up.\n";
|
||||
}
|
||||
}
|
||||
return \%voc;
|
||||
}
|
||||
|
||||
sub add_vocab_from_text {
|
||||
my($voc_ref, $f) = @_;
|
||||
|
||||
my $in = zopen($f);
|
||||
while (my $line = <$in>) {
|
||||
my @words = split(/\s+/, $line);
|
||||
foreach my $word (@words) {
|
||||
$voc_ref->{$word} = 0;
|
||||
}
|
||||
}
|
||||
close($in);
|
||||
}
|
||||
|
||||
# Same as above, but gets rid of sentid (first word on each line)
|
||||
#
|
||||
sub add_vocab_from_sentid {
|
||||
my($voc_ref, $f) = @_;
|
||||
|
||||
my $in = zopen($f);
|
||||
while (my $line = <$in>) {
|
||||
my @words = split(/\s+/, $line);
|
||||
shift(@words); # Toss sentid
|
||||
foreach my $word (@words) {
|
||||
$voc_ref->{$word} = 0;
|
||||
}
|
||||
}
|
||||
close($in);
|
||||
}
|
||||
|
||||
# Same as above, but only uses the first word of each line. Each line
|
||||
# in a count file will have two fields -- word count
|
||||
#
|
||||
sub add_vocab_from_counts {
|
||||
my($voc_ref, $f) = @_;
|
||||
|
||||
my $in = zopen($f);
|
||||
while (my $line = <$in>) {
|
||||
my @fields = split(/\s+/, $line);
|
||||
next if $line =~ /^\s*$/ || $#fields > 1; # Ignore non-unigram counts
|
||||
next if $fields[0] =~ /<.*>/; # Skip pseudo words.
|
||||
$voc_ref->{$fields[0]} = 0;
|
||||
}
|
||||
close($in);
|
||||
}
|
||||
|
||||
# Same as above, but only takes probabilities from the unigram
|
||||
# portion of the arpa-format lm.
|
||||
#
|
||||
sub add_vocab_from_lm {
|
||||
my($voc_ref, $f) = @_;
|
||||
|
||||
my $in = zopen($f);
|
||||
|
||||
# Locate unigram section
|
||||
while (my $line = <$in>) {
|
||||
last if $line =~ /^\\1-grams:/;
|
||||
}
|
||||
|
||||
# Read unigrams into vocab
|
||||
while (my $line = <$in>) {
|
||||
last if $line =~ /^\\2-grams:/;
|
||||
my ($logprob, $word, @rest) = split(/\s+/, $line);
|
||||
next if $word =~ /(^\s*$)|(<.*>)/; # Skip pseudo words.
|
||||
$voc_ref->{$word} = 0;
|
||||
}
|
||||
|
||||
close($in);
|
||||
}
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# The following subroutines are very similar to the ones above.
|
||||
# They return a ref to a hash of unnormalized counts from various kinds
|
||||
# of input files.
|
||||
#
|
||||
sub make_raw_counts {
|
||||
my($f) = @_;
|
||||
|
||||
$ftype = getftype($f);
|
||||
if ($ftype eq "text") {
|
||||
return make_raw_counts_from_text($f);
|
||||
} elsif ($ftype eq "sentid") {
|
||||
return make_raw_counts_from_sentid($f);
|
||||
} elsif ($ftype eq "counts") {
|
||||
return make_raw_counts_from_counts($f);
|
||||
} elsif ($ftype eq "arpa-lm") {
|
||||
return make_raw_counts_from_lm($f);
|
||||
} else {
|
||||
die "I don't know the file type for $f. Giving up.\n";
|
||||
}
|
||||
}
|
||||
|
||||
sub make_raw_counts_from_text {
|
||||
my($f) = @_;
|
||||
my %counts;
|
||||
|
||||
my $in = zopen($f);
|
||||
while (my $line = <$in>) {
|
||||
my @words = split(/\s+/, $line);
|
||||
foreach my $word (@words) {
|
||||
$counts{$word}++;
|
||||
}
|
||||
}
|
||||
close($in);
|
||||
return \%counts;
|
||||
}
|
||||
|
||||
sub make_raw_counts_from_sentid {
|
||||
my($f) = @_;
|
||||
my %counts;
|
||||
|
||||
my $in = zopen($f);
|
||||
while (my $line = <$in>) {
|
||||
my @words = split(/\s+/, $line);
|
||||
shift (@words); # Toss sentid
|
||||
foreach my $word (@words) {
|
||||
$counts{$word}++;
|
||||
}
|
||||
}
|
||||
close($in);
|
||||
return \%counts;
|
||||
}
|
||||
|
||||
sub make_raw_counts_from_counts {
|
||||
my($f) = @_;
|
||||
my %counts;
|
||||
|
||||
my $in = zopen($f);
|
||||
while (my $line = <$in>) {
|
||||
my @fields = split(/\s+/, $line);
|
||||
next if $line =~ /^\s*$/ || $#fields > 1; # Ignore non-unigram counts.
|
||||
next if $fields[0] =~ /<.*>/; # Skip pseudo words.
|
||||
$counts{$fields[0]} += $fields[1];
|
||||
}
|
||||
close($in);
|
||||
return \%counts;
|
||||
}
|
||||
|
||||
# Well, the counts from the lm aren't going to be raw. We just have to
|
||||
# settle for the normalized counts.
|
||||
#
|
||||
sub make_raw_counts_from_lm {
|
||||
my($f) = @_;
|
||||
my %counts;
|
||||
|
||||
my $in = zopen($f);
|
||||
|
||||
# Locate unigram section
|
||||
while (my $line = <$in>) {
|
||||
last if $line =~ /^\\1-grams:/;
|
||||
}
|
||||
|
||||
# Read in unigram counts
|
||||
while (my $line = <$in>) {
|
||||
last if $line =~ /^\\2-grams:/;
|
||||
my ($logprob, $word) = split(/\s+/, $line);
|
||||
next if $word =~ /(^\s*$)|(<.*>)/; # Skip pseudo words.
|
||||
$counts{$word} += 10**$logprob;
|
||||
}
|
||||
close($in);
|
||||
|
||||
return \%counts;
|
||||
}
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
sub getftype {
|
||||
my($f) = @_;
|
||||
|
||||
# First check if it is a sentid file. If necessary insert further checks
|
||||
# by looking into the file.
|
||||
#
|
||||
return "sentid" if ($f =~ /\.sentid(\.gz|\.Z)?$/);
|
||||
|
||||
# Extract the first five lines from the file to make our decision.
|
||||
#
|
||||
my $in = zopen($f);
|
||||
for (my $i = 0; $i < 5; $i++) {
|
||||
$lines[$i] = <$in> || last;
|
||||
}
|
||||
close($in);
|
||||
|
||||
# Is it a count file? Assume it is and try to falsify from the
|
||||
# first 5 lines. Format should be -- word count \n
|
||||
#
|
||||
my $countfile = 1;
|
||||
for (my $i = 0; $i < 5; $i++) {
|
||||
my @words = split(/\s+/, $lines[$i]);
|
||||
if ($words[$#words] !~ /\d+/) {
|
||||
$countfile = 0;
|
||||
last;
|
||||
}
|
||||
}
|
||||
return "counts" if ($countfile == 1);
|
||||
|
||||
# Is it an arpa-style language model?
|
||||
#
|
||||
my $s = join(' ', @lines);
|
||||
return "arpa-lm" if ($s =~ /\s*\\data\\\s*ngram\s+1\s*=/);
|
||||
|
||||
# Otherwise, assume it is a text file.
|
||||
#
|
||||
return "text";
|
||||
}
|
||||
|
||||
# Given log(x) and log(y), this function returns log(x+y).
|
||||
#
|
||||
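# For instance, logsum(log(0.2), log(0.3)) returns log(0.5) (up to rounding);
# recursing with the larger argument first keeps exp($y-$x) <= 1 and avoids overflow.
#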
sub logsum {
|
||||
my($x,$y) = @_;
|
||||
my $z;
|
||||
|
||||
if (!defined($x)) {
|
||||
$z = $y;
|
||||
} elsif (!defined($y)) {
|
||||
$z = $x;
|
||||
} else {
|
||||
$z = ($x < $y)? logsum($y,$x) : $x + log(1+exp($y-$x));
|
||||
}
|
||||
return $z;
|
||||
}
|
||||
|
||||
sub message {
|
||||
my($msg) = @_;
|
||||
|
||||
return if ($Quiet);
|
||||
print STDERR "$msg";
|
||||
}
|
||||
|
||||
# Opens a possibly compressed file. Only uncomment the gzip line
|
||||
# if gzip is available. Otherwise, compressed files aren't supported.
|
||||
#
|
||||
sub zopen {
|
||||
my($f) = @_;
|
||||
local *IN;
|
||||
|
||||
die "$f is not a file.\n" if ! -f $f;
|
||||
|
||||
if (!$Gzip) {
|
||||
open(IN, $f) || die "$f: $!\n";
|
||||
} else {
|
||||
open(IN, "gzip -dfc $f |") || die "gzip -dfc $f: $!\n";
|
||||
}
|
||||
|
||||
return *IN;
|
||||
}
|
||||
|
||||
sub usage {
|
||||
print STDERR <<" .;";
|
||||
|
||||
Usage:
|
||||
$0 [-quiet] [-scale n] -heldout corp_h corp1 corp2 ...
|
||||
|
||||
Estimate weighted and combined counts for the words in the vocabulary.
|
||||
These weights maximize the likelihood of the heldout corpus, corp_h, under a
|
||||
mixture of Witten-Bell smoothed unigram language models estimated from corp1 through
|
||||
corpn.
|
||||
|
||||
-quiet suppresses progress and debug messages while running.
|
||||
-scale n causes final combined counts to be scaled by n.
|
||||
|
||||
.;
|
||||
|
||||
exit 1;
|
||||
}
|
||||
|
||||
#---------------------------------------------------------------------------------
|
||||
# References.
|
||||
#
|
||||
# 1. Venkataraman, A. and W. Wang, (2003). "Techniques for effective vocabulary
|
||||
# selection", in Proceedings of Eurospeech'03, Geneva, 2003.
|
||||
#
|
||||
# 2. Witten, I. H. and T. C. Bell, (1991). "The zero-frequency problem:
|
||||
# Estimating the probabilities of novel events in adaptive text compression",
|
||||
# IEEE Trans. IT, 37, pp. 1085-1091.
|
||||
145
language_model/srilm-1.7.3/utils/src/sentid-to-ctm.gawk
Executable file
@@ -0,0 +1,145 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# sentid-to-ctm --
|
||||
# Format a sentid transcript file into CTM format, faking time marks
|
||||
# by spacing words evenly across the duration of the segment
|
||||
#
|
||||
# Note: this script makes assumptions about the structure of sentence
|
||||
# IDs, specifically how they encode speakers and timemarks.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/sentid-to-ctm.gawk,v 1.11 2019/02/09 07:31:37 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
# time to leave at edges of segments
|
||||
delta = 0.07;
|
||||
|
||||
pause = "-pau-";
|
||||
reject = "@reject@";
|
||||
|
||||
sort_cmd = "sort -b -k 1,1 -k 2,2 -k 3,3n";
|
||||
}
|
||||
|
||||
# read confidences and/or segment information if given
|
||||
NR == 1 {
|
||||
if (confidences) {
|
||||
while ((getline line < confidences) > 0) {
|
||||
nvalues = split(line, a);
|
||||
if (nvalues > 0) {
|
||||
conf_lines[a[1]] = line;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (segments) {
|
||||
while ((getline line < segments) > 0) {
|
||||
nvalues = split(line, a);
|
||||
if (nvalues == 5) {
|
||||
sentid = a[1];
|
||||
segment_conv[sentid] = a[2];
|
||||
segment_channel[sentid] = a[3];
|
||||
segment_start[sentid] = a[4];
|
||||
segment_end[sentid] = a[5];
|
||||
}
|
||||
}
|
||||
close(segments);
|
||||
}
|
||||
}
|
||||
|
||||
function is_nonspeech(w) {
|
||||
return w == pause || w == reject || w ~/^\[.*\]$/ || w ~/^<.*>$/;
|
||||
}
|
||||
|
||||
{
|
||||
orig_sentid = sentid = $1;
|
||||
|
||||
# strip speaker diacritics
|
||||
sub("_s[1-9]$", "", sentid);
|
||||
|
||||
if (segments && sentid in segment_start) {
|
||||
conv = segment_conv[sentid];
|
||||
channel = segment_channel[sentid];
|
||||
start_offset = segment_start[sentid];
|
||||
end_offset = segment_end[sentid];
|
||||
# derive channel and time information from sentids
|
||||
# look for a pattern that encodes channel and
|
||||
# start/end times
|
||||
} else if (match(sentid, "_[0-9]_[-0-9][0-9]*_[0-9][0-9]*$")) {
|
||||
# waveforms with [012] channel id, timemarks 1/1000s
|
||||
# NOTE: this form is used by the segmenter
|
||||
conv = substr(sentid, 1, RSTART-1);
|
||||
split(substr(sentid, RSTART+1), sentid_parts, "_");
|
||||
channel = sentid_parts[1];
|
||||
start_offset = sentid_parts[2] / 1000;
|
||||
end_offset = sentid_parts[3] / 1000;
|
||||
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*$")) {
|
||||
conv = substr(sentid, 1, RSTART-1);
|
||||
split(substr(sentid, RSTART+1), sentid_parts, "_");
|
||||
channel = sentid_parts[1];
|
||||
start_offset = sentid_parts[2] / 100;
|
||||
end_offset = sentid_parts[3] / 100;
|
||||
# new sentids used by Ramana for SPINE segmentations
|
||||
} else if (match(sentid, "_[AB]_[-0-9][0-9]*_[0-9][0-9]*_[-0-9][0-9]*_[0-9][0-9]*$")) {
|
||||
conv = substr(sentid, 1, RSTART-1);
|
||||
split(substr(sentid, RSTART+1), sentid_parts, "_");
|
||||
channel = sentid_parts[1];
|
||||
start_offset = (sentid_parts[2]+sentid_parts[4]) / 100;
|
||||
end_offset = (sentid_parts[2]+sentid_parts[5]) / 100;
|
||||
} else {
|
||||
print "cannot parse sentid " sentid >> "/dev/stderr";
|
||||
conv = sentid;
|
||||
channel = "?";
|
||||
start_offset = 0;
|
||||
end_offset = 10000;
|
||||
}
|
||||
|
||||
$1 = "";
|
||||
$0 = $0;
|
||||
|
||||
numwords = NF;
|
||||
|
||||
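# e.g. (hypothetical): a 2.00-second segment with 4 words and delta = 0.07
# gives word_dur = (2.00 - 2*0.07)/4 = 0.465 seconds per word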
if (numwords > 0) {
|
||||
word_dur = (end_offset - start_offset - 2 * delta)/numwords;
|
||||
} else {
|
||||
word_dur = 0;
|
||||
}
|
||||
|
||||
# find confidence values for this sentid
|
||||
if (confidences) {
|
||||
if (!(orig_sentid in conf_lines)) {
|
||||
print "no confidences for " orig_sentid >> "/dev/stderr";
|
||||
} else {
|
||||
delete conf_values;
|
||||
n_conf_values = \
|
||||
split(conf_lines[orig_sentid], conf_values);
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 1; i <= numwords; i ++) {
|
||||
if (is_nonspeech($i)) continue;
|
||||
|
||||
start_time = start_offset + delta + (i - 1) * word_dur;
|
||||
|
||||
if (i + 1 in conf_values) {
|
||||
conf_value = conf_values[i + 1];
|
||||
} else {
|
||||
conf_value = 0;
|
||||
}
|
||||
|
||||
# split multiwords
|
||||
ncomps = split($i, word_comps, "_");
|
||||
|
||||
for (j = 1; j <= ncomps; j ++) {
|
||||
print conv, channel, \
|
||||
start_time + (j - 1) * word_dur/ncomps,\
|
||||
word_dur/ncomps, \
|
||||
toupper(word_comps[j]), \
|
||||
conf_value | sort_cmd;
|
||||
}
|
||||
}
|
||||
|
||||
if (orig_sentid in conf_lines && numwords != n_conf_values - 1) {
|
||||
print "mismatched number of confidences for " orig_sentid \
|
||||
>> "/dev/stderr";
|
||||
}
|
||||
}
|
||||
60
language_model/srilm-1.7.3/utils/src/sentid-to-sclite.gawk
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# sentid-to-sclite --
|
||||
# convert sentid transcription format to sclite 'trn' format
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/sentid-to-sclite.gawk,v 1.5 2016/09/23 20:05:51 stolcke Exp $
|
||||
#
|
||||
# i.e.:
|
||||
# sentid word1 word2 ....
|
||||
#
|
||||
# becomes
|
||||
#
|
||||
# word1 word2 ... (sentid)
|
||||
#
|
||||
# The sentid is formatted to contain exactly one underscore,
|
||||
# as sclite uses the first portion of the id as a speaker label to
|
||||
# group results.
|
||||
#
|
||||
BEGIN {
|
||||
format_sentids = 1;
|
||||
}
|
||||
|
||||
{
|
||||
sentid = $1;
|
||||
$1 = "";
|
||||
|
||||
if (format_sentids) {
|
||||
# reformat sentid
|
||||
|
||||
# <conv>_<channel>_<utterance> -> <conv><channel>_<utterance>
|
||||
sub("[-_]A", "A", sentid);
|
||||
sub("[-_]B", "B", sentid);
|
||||
sub("[-_]ch1", "ch1", sentid);
|
||||
sub("[-_]ch2", "ch2", sentid);
|
||||
|
||||
# remove underscore after corpus tag, if any
|
||||
if (sentid ~ /^[a-z][a-z]*[-_][0-9]/) {
|
||||
sub("[-_]", "", sentid);
|
||||
}
|
||||
|
||||
# <conv>_<channel>_<utterance> -> <conv><channel>_<utterance>
|
||||
sub("[-_]A", "A", sentid);
|
||||
sub("[-_]B", "B", sentid);
|
||||
sub("[-_]ch1", "ch1", sentid);
|
||||
sub("[-_]ch2", "ch2", sentid);
|
||||
|
||||
# work around problems with negative start times in sentids
|
||||
sub("_-", "_m", sentid);
|
||||
|
||||
#
|
||||
# for sentid not containing _ or -, fake a speaker id out of the first
|
||||
# three characters (this works for ATIS ...)
|
||||
#
|
||||
if (! (sentid ~ /[-_]/)) {
|
||||
sentid = substr(sentid, 1, 3) "_" sentid;
|
||||
}
|
||||
}
|
||||
|
||||
print $0, "(" sentid ")";
|
||||
}
|
||||
56
language_model/srilm-1.7.3/utils/src/sort-lm.gawk
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# sort-lm --
|
||||
# sort the ngrams in an LM in lexicographic order, as required for
|
||||
# some other LM software (notably CMU's).
|
||||
#
|
||||
# usage: sort-lm lm-file > sorted-lm-file
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/sort-lm.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
sorter = "";
|
||||
currorder = 0;
|
||||
}
|
||||
NF==0 {
|
||||
print;
|
||||
next;
|
||||
}
|
||||
/^ngram *[0-9][0-9]*=/ {
|
||||
order = substr($2,1,index($2,"=")-1);
|
||||
print;
|
||||
next;
|
||||
}
|
||||
/^\\[0-9]-grams:/ {
|
||||
if (sorter) {
|
||||
close(sorter);
|
||||
}
|
||||
|
||||
currorder = substr($0,2,1);
|
||||
print;
|
||||
fflush();
|
||||
|
||||
# set up new sorting pipeline;
|
||||
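# e.g. for the 3-gram section this builds "sort +1 -2 +2 -3 +3 -4", i.e. a sort
# on fields 2 through 4 (the ngram words), skipping the log probability in field 1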
sorter = "sort";
|
||||
for (i = 1; i <= currorder; i ++) {
|
||||
sorter = sorter " +" i " -" (i+1);
|
||||
}
|
||||
# print sorter >> "/dev/stderr";
|
||||
next;
|
||||
}
|
||||
/^\\/ {
|
||||
if (sorter) {
|
||||
close(sorter);
|
||||
sorter = "";
|
||||
}
|
||||
currorder = 0;
|
||||
print; next;
|
||||
}
|
||||
currorder && NF > 1 {
|
||||
print | sorter;
|
||||
next;
|
||||
}
|
||||
{
|
||||
print;
|
||||
}
|
||||
57
language_model/srilm-1.7.3/utils/src/split-tagged-ngrams.gawk
Executable file
@@ -0,0 +1,57 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# split-tagged-ngrams --
|
||||
# multiply tagged-word ngrams out into ngrams that contain
|
||||
# combinations of words and tags
|
||||
#
|
||||
# sample input:
|
||||
# a/A b/B 10
|
||||
# sample output:
|
||||
# a b 10
|
||||
# a B 10
|
||||
# A b 10
|
||||
# A B 10
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/split-tagged-ngrams.gawk,v 1.2 2006/02/11 01:31:32 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
separator = "/";
|
||||
}
|
||||
|
||||
# recursive expansion of the tagged-word ngram
|
||||
function expand_ngram(ng, n, suffix, c,
|
||||
word, tag, word_tag) {
|
||||
if (n == 0) {
|
||||
print suffix, c;
|
||||
} else {
|
||||
last_item = ng[n];
|
||||
|
||||
if (split(last_item, word_tag, separator) == 2) {
|
||||
word = word_tag[1];
|
||||
tag = word_tag[2];
|
||||
expand_ngram(ng, n-1, word " " suffix, c);
|
||||
expand_ngram(ng, n-1, tag " " suffix, c);
|
||||
} else {
|
||||
expand_ngram(ng, n-1, last_item " " suffix, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NF > 1 {
|
||||
count = $NF;
|
||||
|
||||
delete ngram;
|
||||
for (i = 1; i < NF; i ++) {
|
||||
ngram[i] = $i;
|
||||
}
|
||||
|
||||
expand_ngram(ngram, NF - 1, "", count);
|
||||
|
||||
next;
|
||||
}
|
||||
|
||||
{
|
||||
print;
|
||||
}
|
||||
|
||||
42
language_model/srilm-1.7.3/utils/src/subset-context-ngrams.gawk
Executable file
@@ -0,0 +1,42 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# subset-context-ngrams --
|
||||
# Extract counts corresponding to ngram contexts
|
||||
#
|
||||
# usage: subset-context-ngrams contexts=FILE COUNTS > SUBSET
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/subset-context-ngrams.gawk,v 1.1 2008/09/30 03:54:05 stolcke Exp $
|
||||
#
|
||||
|
||||
# read contexts
|
||||
NR == 1 {
|
||||
saveline = $0;
|
||||
|
||||
if (contexts != "") {
|
||||
howmany = 0;
|
||||
while ((getline < contexts) > 0) {
|
||||
if (NF < 2) continue;
|
||||
$NF = "";
|
||||
subset_contexts[$0 FS] = 1;
|
||||
howmany ++;
|
||||
}
|
||||
print "read " howmany " contexts" > "/dev/stderr";
|
||||
}
|
||||
|
||||
$0 = saveline;
|
||||
}
|
||||
|
||||
NF == 2 {
|
||||
print;
|
||||
next;
|
||||
}
|
||||
|
||||
NF > 2 {
|
||||
saveline = $0;
|
||||
|
||||
$NF = $(NF-1) = "";
|
||||
if ($0 in subset_contexts) {
|
||||
print saveline;
|
||||
}
|
||||
}
|
||||
|
||||
44
language_model/srilm-1.7.3/utils/src/subtract-ppls.gawk
Executable file
@@ -0,0 +1,44 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# subtract-ppls --
|
||||
# Subtracts text statistics (from -ppl output)
|
||||
#
|
||||
# The first input file contains a total, from which subsequent stats are
|
||||
# subtracted. The result is printed in a format compatible with -ppl.
|
||||
#
|
||||
# Copyright (c) 1995, SRI International. All Rights Reserved
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/subtract-ppls.gawk,v 1.2 1997/07/12 05:01:08 stolcke Exp $
|
||||
#
|
||||
/^file .*: .* sentences/ {
|
||||
if (ARGIND == 1) {
|
||||
totalsents = $3;
|
||||
totalwords = $5;
|
||||
totaloovs = $7;
|
||||
} else {
|
||||
totalsents -= $3;
|
||||
totalwords -= $5;
|
||||
totaloovs -= $7;
|
||||
}
|
||||
|
||||
getline;
|
||||
|
||||
if (ARGIND == 1) {
|
||||
zeroprobs = $1;
|
||||
totalprob = $4;
|
||||
} else {
|
||||
zeroprobs -= $1;
|
||||
totalprob -= $4;
|
||||
}
|
||||
}
|
||||
END {
|
||||
M_LN10 = 2.30258509299404568402; # from <math.h>
|
||||
|
||||
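# totalprob is a log10 probability, hence the M_LN10 conversion factor; the
# denominator is the number of scored tokens: words plus end-of-sentence events,
# minus OOVs and zeroprobs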
ppl = exp (- M_LN10 * totalprob / \
|
||||
(totalwords - totaloovs - zeroprobs + totalsents));
|
||||
|
||||
printf "file TOTAL: %d sentences, %d words, %d OOVs\n", \
|
||||
totalsents, totalwords, totaloovs;
|
||||
printf "%d zeroprobs, logprob= %g ppl= %g\n", \
|
||||
zeroprobs, totalprob, ppl;
|
||||
}
|
||||
13
language_model/srilm-1.7.3/utils/src/tolower-ngram-counts.gawk
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# tolower-ngram-counts --
|
||||
# Map N-gram counts to lowercase
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/tolower-ngram-counts.gawk,v 1.1 2007/07/13 23:38:22 stolcke Exp $
|
||||
#
|
||||
{
|
||||
for (i = 1; i < NF; i ++) {
|
||||
$i = tolower($i);
|
||||
}
|
||||
print;
|
||||
}
|
||||
65
language_model/srilm-1.7.3/utils/src/uniform-classes.gawk
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# uniform-classes --
|
||||
# Assign uniform membership probabilities to word class expansions
|
||||
# that don't already have probabilities
|
||||
#
|
||||
# usage: uniform-classes CLASSFILE > UNIFORM-CLASSFILE
|
||||
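# For example (hypothetical input):
#	COLOR red
#	COLOR 0.5 green
#	COLOR blue
# yields output in which green keeps its 0.5 while red and blue each get the
# default 1/3 (one over the number of expansions listed for the class).
#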
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/uniform-classes.gawk,v 1.3 2016/05/13 23:00:35 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
num_class_defs = 0;
|
||||
}
|
||||
|
||||
{
|
||||
line = $0;
|
||||
|
||||
n = split(line, a);
|
||||
if (n == 0) next;
|
||||
|
||||
class = a[1];
|
||||
num_exp = ++ num_class_expansions[class];
|
||||
|
||||
if (a[2] ~ /^[-+]?[.]?[0-9][0-9.]*(e[+-]?[0-9]+)?$/) {
|
||||
prob = a[2];
|
||||
i = 3;
|
||||
} else {
|
||||
prob = "";
|
||||
i = 2;
|
||||
}
|
||||
|
||||
expansion = a[i];
|
||||
for (i++; i <= n; i++) {
|
||||
expansion = expansion " " a[i];
|
||||
}
|
||||
|
||||
class_expansions[class " " num_exp] = expansion;
|
||||
if (prob != "") {
|
||||
class_expansion_probs[class " " num_exp] = prob;
|
||||
}
|
||||
num_class_defs ++;
|
||||
}
|
||||
|
||||
END {
|
||||
print "read " num_class_defs " class expansions" >> "/dev/stderr";
|
||||
|
||||
# assign default expansion probs
|
||||
|
||||
for (class in num_class_expansions) {
|
||||
|
||||
num_exp = num_class_expansions[class];
|
||||
|
||||
for (i = 1; i <= num_exp; i ++) {
|
||||
prob = class_expansion_probs[class " " i];
|
||||
|
||||
if (prob == "") {
|
||||
prob = 1/num_exp;
|
||||
}
|
||||
|
||||
print class, prob, class_expansions[class " " i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
36
language_model/srilm-1.7.3/utils/src/uniq-ngram-counts.gawk
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# uniq-ngram-counts --
|
||||
# Collapse identical successive N-grams in counts file
|
||||
#
|
||||
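# For example (hypothetical, already-sorted input):
#	a b 3
#	a b 2
#	a c 1
# becomes
#	a b 5
#	a c 1
#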
# $Header: /home/srilm/CVS/srilm/utils/src/uniq-ngram-counts.gawk,v 1.2 2007/07/13 23:50:28 stolcke Exp $
|
||||
#
|
||||
{
|
||||
if (NF == 1) {
|
||||
ngram = " ";
|
||||
} else {
|
||||
ngram = "";
|
||||
}
|
||||
|
||||
for (i = 1; i < NF; i ++) {
|
||||
ngram = ngram " " $i;
|
||||
}
|
||||
|
||||
# starting ngrams with space character forces string comparison
|
||||
if (ngram != last_ngram) {
|
||||
if (last_ngram != "") {
|
||||
# avoid outputting initial space
|
||||
print substr(last_ngram, 2), total_count;
|
||||
}
|
||||
total_count = 0;
|
||||
last_ngram = ngram;
|
||||
}
|
||||
|
||||
total_count += $NF;
|
||||
}
|
||||
|
||||
END {
|
||||
if (last_ngram != "") {
|
||||
print substr(last_ngram, 2), total_count;
|
||||
}
|
||||
}
|
||||
79
language_model/srilm-1.7.3/utils/src/vp2text.gawk
Executable file
@@ -0,0 +1,79 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# vp2text --
|
||||
# Convert the ARPA CSR vp (verbalized punctuation) format to plain
|
||||
# text for LM training.
|
||||
#
|
||||
# This combines the functionality of Roni Rosenfeld's "vp2svp1" and
|
||||
# "sgml2text" utilities (except for case mapping). No <s> and </s>
|
||||
# tags are retained, since our LM software doesn't need them.
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/vp2text.gawk,v 1.2 1996/09/17 21:59:57 stolcke Exp $
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
iquote = 0;
|
||||
nquote = 5;
|
||||
}
|
||||
# Reset the quote counter at article boundaries
|
||||
/^<art\./ {
|
||||
iquote = 0;
|
||||
}
|
||||
/^<DOC/ {
|
||||
iquote = 0;
|
||||
}
|
||||
#
|
||||
# Filter out SGML tags
|
||||
#
|
||||
/^</ {
|
||||
next;
|
||||
}
|
||||
#
|
||||
# Do all the easy replacements
|
||||
{
|
||||
# These are pronounced
|
||||
gsub("@AT-SIGN", "at");
|
||||
gsub("&ERSAND", "and");
|
||||
gsub("\\+PLUS", "plus");
|
||||
gsub("=EQUALS", "equals");
|
||||
gsub("%PERCENT", "percent");
|
||||
gsub("/SLASH", "slash");
|
||||
gsub("\\.POINT", "point");
|
||||
|
||||
# These aren't
|
||||
gsub(",COMMA", "");
|
||||
gsub("\\?QUESTION-MARK", "");
|
||||
gsub(":COLON", "");
|
||||
gsub("\#SHARP-SIGN", "");
|
||||
gsub("'SINGLE-QUOTE", "");
|
||||
gsub(";SEMI-COLON", "");
|
||||
gsub("!EXCLAMATION-POINT", "");
|
||||
gsub("{LEFT-BRACE", "");
|
||||
gsub("}RIGHT-BRACE", "");
|
||||
gsub("\\(LEFT-PAREN", "");
|
||||
gsub("\\)RIGHT-PAREN", "");
|
||||
gsub("\\.PERIOD", "");
|
||||
gsub("\\.\\.\\.ELLIPSIS", "");
|
||||
gsub("--DASH", "");
|
||||
gsub("-HYPHEN", "");
|
||||
}
|
||||
# Handle lines containing "DOUBLE-QUOTE as a special case since this
|
||||
# is more costly: replace every nquote'th occurrence with "quote", else
|
||||
# delete it.
|
||||
/"DOUBLE-QUOTE/ {
|
||||
output = "";
|
||||
for (i = 1; i <= NF; i++) {
|
||||
if ($i == "\"DOUBLE-QUOTE") {
|
||||
if ((iquote++) % nquote == 0) {
|
||||
output = output " quote";
|
||||
}
|
||||
} else {
|
||||
output = output " " $i;
|
||||
}
|
||||
}
|
||||
print output;
|
||||
next;
|
||||
}
|
||||
{
|
||||
print;
|
||||
}
|
||||
138
language_model/srilm-1.7.3/utils/src/wlat-stats.gawk
Executable file
@@ -0,0 +1,138 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# wlat-stats --
|
||||
# Compute statistics of word posterior lattices
|
||||
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/wlat-stats.gawk,v 1.6 2019/07/24 16:16:55 stolcke Exp $
|
||||
#
|
||||
BEGIN {
|
||||
name = "";
|
||||
nhyps = 0;
|
||||
entropy = 0;
|
||||
nwords = 0;
|
||||
ewords = 0; # posterior expected words
|
||||
|
||||
nsub = nins = ndel = 0; # 1best error counts
|
||||
min_errs = 0; # oracle error count
|
||||
|
||||
M_LN10 = 2.30258509299404568402;
|
||||
|
||||
empty_hyp = "*DELETE*";
|
||||
|
||||
total_posterior = 1;
|
||||
}
|
||||
|
||||
$1 == "name" {
|
||||
name = $2;
|
||||
next;
|
||||
}
|
||||
|
||||
$1 == "posterior" {
|
||||
total_posterior = $2;
|
||||
next;
|
||||
}
|
||||
|
||||
#
|
||||
# word lattice format:
|
||||
# node 46 them 11 0.011827 45 0.0111445 13 0.000682478 ...
|
||||
#
|
||||
$1 == "node" {
|
||||
word = $3;
|
||||
posterior = $5;
|
||||
|
||||
if (word != "NULL") {
|
||||
nhyps ++;
|
||||
}
|
||||
|
||||
if (posterior > 0) {
|
||||
for (i = 6; i <= NF; i += 2) {
|
||||
prob = $(i + 1);
|
||||
|
||||
if (prob > 0) {
|
||||
entropy -= prob * log(prob/posterior);
|
||||
if (word != "NULL") {
|
||||
ewords += prob;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# confusion network format:
|
||||
# align 4 okay 0.998848 ok 0.00113834 i 1.06794e-08 a 4.48887e-08 ...
|
||||
#
|
||||
$1 == "align" {
|
||||
align_pos = $2;
|
||||
|
||||
best_hyp = "";
|
||||
best_posterior = 0;
|
||||
delete all_hyps;
|
||||
for (i = 3; i <= NF; i += 2) {
|
||||
word = $i;
|
||||
|
||||
if (word != "*DELETE*") {
|
||||
nhyps ++;
|
||||
}
|
||||
|
||||
prob = $(i + 1);
|
||||
if (prob > 0) {
|
||||
entropy -= prob/total_posterior * log(prob/total_posterior);
|
||||
all_hyps[word] = 1;
|
||||
|
||||
if (word != "*DELETE*") {
|
||||
ewords += prob/total_posterior;
|
||||
}
|
||||
}
|
||||
|
||||
if (prob > best_posterior) {
|
||||
best_posterior = prob;
|
||||
best_hyp = word;
|
||||
}
|
||||
}
|
||||
}
|
||||
$1 == "reference" && $2 == align_pos {
|
||||
if ($3 != empty_hyp) {
|
||||
nwords ++;
|
||||
|
||||
if (best_hyp == empty_hyp) {
|
||||
ndel ++;
|
||||
} else if (best_hyp != $3) {
|
||||
nsub ++;
|
||||
}
|
||||
} else {
|
||||
if (best_hyp != empty_hyp) {
|
||||
nins ++;
|
||||
}
|
||||
}
|
||||
|
||||
# update oracle error
|
||||
if (!($3 in all_hyps)) {
|
||||
min_errs ++;
|
||||
}
|
||||
|
||||
align_pos = -1;
|
||||
}
|
||||
|
||||
END {
|
||||
printf name (name != "" ? " " : "") \
|
||||
nhyps " hypotheses " \
|
||||
entropy/M_LN10 " entropy " \
|
||||
ewords " ewords";
|
||||
if (nwords > 0) {
|
||||
printf " " nwords " words " nhyps/nwords " hyps/word " \
|
||||
entropy/M_LN10/nwords " entropy/word";
|
||||
}
|
||||
printf "\n";
|
||||
if (nwords > 0) {
|
||||
nerrors = nsub + nins + ndel;
|
||||
printf name (name != "" ? " " : "") \
|
||||
nerrors " errors " nerrors*100/nwords " WER " \
|
||||
nsub*100/nwords " SUB " nins*100/nwords " INS " \
|
||||
ndel*100/nwords " DEL\n";
|
||||
|
||||
printf name (name != "" ? " " : "") \
|
||||
min_errs " minerrors " min_errs*100/nwords " minWER\n";
|
||||
}
|
||||
}
|
||||
|
||||
105
language_model/srilm-1.7.3/utils/src/wlat-to-dot.gawk
Executable file
@@ -0,0 +1,105 @@
|
||||
#!/usr/local/bin/gawk -f
|
||||
#
|
||||
# wlat-to-dot --
|
||||
# Generate dot(1) graph description from word lattices generated by
|
||||
# nbest-lattice(1)
|
||||
#
|
||||
# usage: wlat-to-dot [show_probs=1] file.wlat > file.dot
|
||||
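# (the resulting .dot file can be rendered with Graphviz, e.g. "dot -Tps file.dot > file.ps")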
#
|
||||
# $Header: /home/srilm/CVS/srilm/utils/src/wlat-to-dot.gawk,v 1.6 2004/11/02 02:00:35 stolcke Exp $
|
||||
#
|
||||
BEGIN {
|
||||
name = "WLAT";
|
||||
show_probs = 0;
|
||||
show_nums = 0;
|
||||
|
||||
version = 1;
|
||||
}
|
||||
$1 == "name" {
|
||||
name = $2;
|
||||
}
|
||||
|
||||
#
|
||||
# nbest-lattice output (without -use-mesh)
|
||||
#
|
||||
$1 == "initial" {
|
||||
print "digraph \"" name "\" {";
|
||||
print "rankdir = LR";
|
||||
|
||||
i = $2;
|
||||
}
|
||||
$1 == "final" {
|
||||
i = $2;
|
||||
}
|
||||
$1 == "version" {
|
||||
version = $2;
|
||||
}
|
||||
$1 == "node" && version == 1 {
|
||||
from = $2;
|
||||
word = $3;
|
||||
post = $4;
|
||||
|
||||
print "\tnode" from " [label=\"" word \
|
||||
(!show_nums ? "" : ("/" from)) \
|
||||
(!show_probs ? "" : "\\n" post ) "\"]";
|
||||
|
||||
for (i = 5; i <= NF; i ++) {
|
||||
to = $i;
|
||||
print "\tnode" from " -> node" to ";"
|
||||
}
|
||||
}
|
||||
$1 == "node" && version == 2 {
|
||||
from = $2;
|
||||
word = $3;
|
||||
align = $4;
|
||||
post = $5;
|
||||
|
||||
print "\tnode" from " [label=\"" word \
|
||||
(!show_nums ? "" : ("/" from)) \
|
||||
"\\n" align \
|
||||
(!show_probs ? "" : "/" post ) "\"]";
|
||||
|
||||
for (i = 6; i <= NF; i += 2) {
|
||||
to = $i;
|
||||
print "\tnode" from " -> node" to \
|
||||
(!show_probs ? "" : " [label=\"" $(i + 1) "\"]") ";"
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# nbest-lattice -use-mesh output (confusion networks)
|
||||
#
|
||||
|
||||
$1 == "numaligns" {
|
||||
print "digraph \"" name "\" {";
|
||||
print "rankdir = LR";
|
||||
|
||||
print "node0 [label=\"" (show_nums ? 0 : "") "\"]";
|
||||
}
|
||||
|
||||
$1 == "align" {
|
||||
|
||||
pos = $2;
|
||||
|
||||
for (i = 3; i <= NF; i += 2) {
|
||||
word = $i;
|
||||
posterior = $(i + 1);
|
||||
|
||||
if (posterior == 0) {
|
||||
print "align " pos ", word " word \
|
||||
": zero posterior, omitting it" >> "/dev/stderr";
|
||||
continue;
|
||||
}
|
||||
|
||||
print "node" pos " -> node" (pos + 1) \
|
||||
" [label=\"" word \
|
||||
(show_probs ? ("\\n" posterior) : "") \
|
||||
"\"]";
|
||||
}
|
||||
print "node" (pos + 1) " [label=\"" (show_nums ? (pos + 1) : "") "\"]";
|
||||
}
|
||||
|
||||
END {
|
||||
print "}"
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.