60 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			60 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
| #!/bin/bash
 | |
| 
 | |
| # koried, 10/29/2012
 | |
| 
 | |
| # Reduce a data set based on a list of turn-ids
 | |
| 
 | |
| help_message="usage: $0 srcdir turnlist destdir"
 | |
| 
 | |
| if [ $1 == "--help" ]; then
 | |
|     echo "${help_message}"
 | |
|     exit 0;
 | |
| fi
 | |
| 
 | |
| if [ $# != 3 ]; then
 | |
|     echo "${help_message}"
 | |
|     exit 1;
 | |
| fi
 | |
| 
 | |
| srcdir=$1
 | |
| reclist=$2
 | |
| destdir=$3
 | |
| 
 | |
| if [ ! -f ${srcdir}/utt2spk ]; then
 | |
| echo "$0: no such file $srcdir/utt2spk"
 | |
| exit 1;
 | |
| fi
 | |
| 
 | |
| function do_filtering {
 | |
| # assumes the utt2spk and spk2utt files already exist.
 | |
|     [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp
 | |
|     [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp
 | |
|     [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text
 | |
|     [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames
 | |
|     [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender
 | |
|     [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp
 | |
|     if [ -f ${srcdir}/segments ]; then
 | |
|         utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments
 | |
|         awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings.
 | |
|         # The next line would override the command above for wav.scp, which would be incorrect.
 | |
|         [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp
 | |
|         [ -f ${srcdir}/reco2file_and_channel ] && \
 | |
|             utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel
 | |
| 
 | |
|         # Filter the STM file for proper sclite scoring (this will also remove the comments lines)
 | |
|         [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm
 | |
|         rm ${destdir}/reco
 | |
|     fi
 | |
|     srcutts=$(wc -l < ${srcdir}/utt2spk)
 | |
|     destutts=$(wc -l < ${destdir}/utt2spk)
 | |
|     echo "Reduced #utt from $srcutts to $destutts"
 | |
| }
 | |
| 
 | |
| mkdir -p ${destdir}
 | |
| 
 | |
| # filter the utt2spk based on the set of recordings
 | |
| utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk
 | |
| 
 | |
| utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt
 | |
| do_filtering;
 | 
