Files
b2txt25/language_model/tools/format_data.sh
2025-07-02 12:18:09 -07:00

167 lines
5.2 KiB
Bash
Executable File

#!/bin/bash
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Mobvoi Corporation (Author: Di Wu)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh
nj=1
cmd=run.pl
nlsyms=""
lang=""
feat=""
feat_type="kaldi"
oov="<unk>"
bpecode=""
allow_one_column=false
raw=""
verbose=0
trans_type=char
filetype=""
preprocess_conf=""
category=""
out="" # If omitted, write in stdout
help_message=$(cat << EOF
Usage: $0 <data-dir> <dict>
e.g. $0 data/train data/lang_1char/train_units.txt
Options:
--nj <nj> # number of parallel jobs
--cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
--feat <feat-scp> # feat.scp or feat1.scp,feat2.scp,...
--feat-type <feat-type> # kaldi or wav
--oov <oov-word> # Default: <unk>
--out <outputfile> # If omitted, write in stdout
--filetype <mat|hdf5|sound.hdf5> # Specify the format of feats file
--preprocess-conf <json> # Apply preprocess to feats when creating shape.scp
--verbose <num> # Default: 0
EOF
)
. tools/parse_options.sh
if [ $# != 2 ]; then
echo "${help_message}" 1>&2
exit 1;
fi
set -euo pipefail
dir=$1
dic=$2
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
#trap 'rm -rf ${tmpdir}' EXIT
# 1. Create scp files for inputs
# These are not necessary for decoding mode, and make it as an option
input=
if [ -n "${feat}" ]; then
_feat_scps=$(echo "${feat}" | tr ',' ' ' )
read -r -a feat_scps <<< $_feat_scps
num_feats=${#feat_scps[@]}
for (( i=1; i<=num_feats; i++ )); do
feat=${feat_scps[$((i-1))]}
mkdir -p ${tmpdir}/input_${i}
input+="input_${i} "
cat ${feat} > ${tmpdir}/input_${i}/feat.scp
# Dump in the "legacy" style JSON format
if [ -n "${filetype}" ]; then
awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \
> ${tmpdir}/input_${i}/filetype.scp
fi
if [ ${feat_type} == "kaldi" ]; then
tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \
--filetype "${filetype}" \
--preprocess-conf "${preprocess_conf}" \
--verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp
elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then
if [ -f $dir/segments ]; then
# used for segmented wav.scp
awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur
fi
if [ ! -f $dir/utt2dur ]; then
tools/wav_to_duration.sh --nj ${nj} \
${feat} ${tmpdir}/input_${i}/shape.scp
# use the existed utt2dur as shape.scp directly
else
cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp
fi
fi
done
fi
# 2. Create scp files for outputs
mkdir -p ${tmpdir}/output
if [ -n "${bpecode}" ]; then
if [ "${trans_type}" == "cn_char_en_bpe" ]; then
tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
else
paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \
| tools/spm_encode --model=${bpecode} --output_format=piece) \
> ${tmpdir}/output/token.scp
fi
elif [ -n "${nlsyms}" ]; then
tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
elif [ -n "${raw}" ]; then
cat $dir/text > ${tmpdir}/output/token.scp
else
tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
fi
< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
odim=$(cat ${dic} | wc -l)
< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp
cat ${dir}/text > ${tmpdir}/output/text.scp
# 3. Create scp files for the others
mkdir -p ${tmpdir}/other
if [ -n "${lang}" ]; then
awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp
fi
if [ -n "${category}" ]; then
awk -v category=${category} '{print $1 " " category}' ${dir}/text \
> ${tmpdir}/other/category.scp
fi
#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp
# 4. Merge scp files into a one file
opts=""
for intype in ${input} output other; do
if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then
continue
fi
if [ ${intype} != other ]; then
opts+="--${intype%_*}-scps "
else
opts+="--scps "
fi
for x in "${tmpdir}/${intype}"/*.scp; do
k=$(basename ${x} .scp)
if [ ${k} = shape ]; then
opts+="shape:${x}:shape "
else
opts+="${k}:${x} "
fi
done
done
if ${allow_one_column}; then
opts+="--allow-one-column true "
else
opts+="--allow-one-column false "
fi
if [ -n "${out}" ]; then
opts+="-O ${out}"
fi
tools/merge_scp2txt.py --verbose ${verbose} ${opts}
#rm -fr ${tmpdir}