competition update
This commit is contained in:
59
language_model/tools/reduce_data_dir.sh
Executable file
59
language_model/tools/reduce_data_dir.sh
Executable file
@@ -0,0 +1,59 @@
|
||||
#!/bin/bash
|
||||
|
||||
# koried, 10/29/2012
|
||||
|
||||
# Reduce a data set based on a list of turn-ids
|
||||
|
||||
help_message="usage: $0 srcdir turnlist destdir"
|
||||
|
||||
if [ $1 == "--help" ]; then
|
||||
echo "${help_message}"
|
||||
exit 0;
|
||||
fi
|
||||
|
||||
if [ $# != 3 ]; then
|
||||
echo "${help_message}"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
srcdir=$1
|
||||
reclist=$2
|
||||
destdir=$3
|
||||
|
||||
if [ ! -f ${srcdir}/utt2spk ]; then
|
||||
echo "$0: no such file $srcdir/utt2spk"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
function do_filtering {
|
||||
# assumes the utt2spk and spk2utt files already exist.
|
||||
[ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp
|
||||
[ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp
|
||||
[ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text
|
||||
[ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames
|
||||
[ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender
|
||||
[ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp
|
||||
if [ -f ${srcdir}/segments ]; then
|
||||
utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments
|
||||
awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings.
|
||||
# The next line would override the command above for wav.scp, which would be incorrect.
|
||||
[ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp
|
||||
[ -f ${srcdir}/reco2file_and_channel ] && \
|
||||
utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel
|
||||
|
||||
# Filter the STM file for proper sclite scoring (this will also remove the comments lines)
|
||||
[ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm
|
||||
rm ${destdir}/reco
|
||||
fi
|
||||
srcutts=$(wc -l < ${srcdir}/utt2spk)
|
||||
destutts=$(wc -l < ${destdir}/utt2spk)
|
||||
echo "Reduced #utt from $srcutts to $destutts"
|
||||
}
|
||||
|
||||
mkdir -p ${destdir}
|
||||
|
||||
# filter the utt2spk based on the set of recordings
|
||||
utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk
|
||||
|
||||
utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt
|
||||
do_filtering;
|
Reference in New Issue
Block a user