#!/usr/bin/env python3 # encoding: utf-8 from __future__ import print_function from __future__ import unicode_literals import argparse import codecs from distutils.util import strtobool from io import open import logging import sys PY2 = sys.version_info[0] == 2 sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else sys.stdin.buffer) sys.stdout = codecs.getwriter('utf-8')( sys.stdout if PY2 else sys.stdout.buffer) # Special types: def shape(x): """Change str to List[int] >>> shape('3,5') [3, 5] >>> shape(' [3, 5] ') [3, 5] """ # x: ' [3, 5] ' -> '3, 5' x = x.strip() if x[0] == '[': x = x[1:] if x[-1] == ']': x = x[:-1] return list(map(int, x.split(','))) def get_parser(): parser = argparse.ArgumentParser( description='Given each file paths with such format as ' '::. type> can be omitted and the default ' 'is "str". e.g. {} ' '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' '--output-scps text:data/text shape:data/utt2text_shape:shape ' '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--input-scps', type=str, nargs='*', action='append', default=[], help='files for the inputs') parser.add_argument('--output-scps', type=str, nargs='*', action='append', default=[], help='files for the outputs') parser.add_argument('--scps', type=str, nargs='+', default=[], help='The files except for the input and outputs') parser.add_argument('--verbose', '-V', default=1, type=int, help='Verbose option') parser.add_argument('--allow-one-column', type=strtobool, default=False, help='Allow one column in input scp files. ' 'In this case, the value will be empty string.') parser.add_argument('--out', '-O', type=str, help='The output filename. ' 'If omitted, then output to sys.stdout') return parser if __name__ == '__main__': parser = get_parser() args = parser.parse_args() args.scps = [args.scps] # logging info logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" if args.verbose > 0: logging.basicConfig(level=logging.INFO, format=logfmt) else: logging.basicConfig(level=logging.WARN, format=logfmt) inputs = {} assert (len(args.input_scps) == 1) for f in args.input_scps[0]: arr = f.strip().split(':') inputs[arr[0]] = arr[1] assert ('feat' in inputs) assert ('shape' in inputs) outputs = {} assert (len(args.output_scps) == 1) for f in args.output_scps[0]: arr = f.strip().split(':') outputs[arr[0]] = arr[1] assert ('shape' in outputs) assert ('text' in outputs) assert ('token' in outputs) assert ('tokenid' in outputs) files = [ inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], outputs['tokenid'], outputs['shape'] ] fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] fids = [open(f, 'r', encoding='utf-8') for f in files] if args.out is None: out = sys.stdout else: out = open(args.out, 'w', encoding='utf-8') done = False while not done: for i, fid in enumerate(fids): line = fid.readline() if line == '': done = True break arr = line.strip().split() content = ' '.join(arr[1:]) if i == 0: out.write('utt:{}'.format(arr[0])) out.write('\t') out.write('{}:{}'.format(fields[i], content)) out.write('\n') for f in fids: f.close() if args.out is not None: out.close()