146 lines
		
	
	
		
			4.4 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			146 lines
		
	
	
		
			4.4 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env python3
 | |
| # encoding: utf-8
 | |
| 
 | |
| from __future__ import print_function
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| import argparse
 | |
| import codecs
 | |
| from distutils.util import strtobool
 | |
| from io import open
 | |
| import logging
 | |
| import sys
 | |
| 
 | |
# True when running under a Python 2 interpreter.
PY2 = sys.version_info[0] == 2

# Force UTF-8 on the standard streams regardless of the locale.  Python 2
# streams already carry bytes; on Python 3 the codec has to wrap the
# underlying binary ``.buffer`` instead of the text layer.
if PY2:
    sys.stdin = codecs.getreader('utf-8')(sys.stdin)
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
else:
    sys.stdin = codecs.getreader('utf-8')(sys.stdin.buffer)
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer)
 | |
| 
 | |
| 
 | |
| # Special types:
 | |
def shape(x):
    """Parse a textual shape into a list of ints.

    The input is a comma-separated list of integers, optionally wrapped in
    square brackets and surrounded by whitespace.  An empty shape (``''`` or
    ``'[]'``) yields an empty list instead of raising ``IndexError``.

    Args:
        x: The string to parse, e.g. ``'3,5'`` or ``' [3, 5] '``.

    Returns:
        The dimensions as a list of ints.

    Raises:
        ValueError: If any comma-separated item is not an integer.

    >>> shape('3,5')
    [3, 5]
    >>> shape(' [3, 5] ')
    [3, 5]
    >>> shape('[]')
    []

    """

    # x: ' [3, 5] ' -> '3, 5'
    text = x.strip()
    # startswith/endswith are safe on the empty string, unlike text[0].
    if text.startswith('['):
        text = text[1:]
    if text.endswith(']'):
        text = text[:-1]

    # Nothing left between the brackets: an empty shape.
    if not text.strip():
        return []

    return [int(dim) for dim in text.split(',')]
 | |
| 
 | |
| 
 | |
| def get_parser():
 | |
|     parser = argparse.ArgumentParser(
 | |
|         description='Given each file paths with such format as '
 | |
|         '<key>:<file>:<type>. type> can be omitted and the default '
 | |
|         'is "str". e.g. {} '
 | |
|         '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape '
 | |
|         '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape '
 | |
|         '--output-scps text:data/text shape:data/utt2text_shape:shape '
 | |
|         '--scps utt2spk:data/utt2spk'.format(sys.argv[0]),
 | |
|         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 | |
|     parser.add_argument('--input-scps',
 | |
|                         type=str,
 | |
|                         nargs='*',
 | |
|                         action='append',
 | |
|                         default=[],
 | |
|                         help='files for the inputs')
 | |
|     parser.add_argument('--output-scps',
 | |
|                         type=str,
 | |
|                         nargs='*',
 | |
|                         action='append',
 | |
|                         default=[],
 | |
|                         help='files for the outputs')
 | |
|     parser.add_argument('--scps',
 | |
|                         type=str,
 | |
|                         nargs='+',
 | |
|                         default=[],
 | |
|                         help='The files except for the input and outputs')
 | |
|     parser.add_argument('--verbose',
 | |
|                         '-V',
 | |
|                         default=1,
 | |
|                         type=int,
 | |
|                         help='Verbose option')
 | |
|     parser.add_argument('--allow-one-column',
 | |
|                         type=strtobool,
 | |
|                         default=False,
 | |
|                         help='Allow one column in input scp files. '
 | |
|                         'In this case, the value will be empty string.')
 | |
|     parser.add_argument('--out',
 | |
|                         '-O',
 | |
|                         type=str,
 | |
|                         help='The output filename. '
 | |
|                         'If omitted, then output to sys.stdout')
 | |
|     return parser
 | |
| 
 | |
| 
 | |
| if __name__ == '__main__':
 | |
|     parser = get_parser()
 | |
|     args = parser.parse_args()
 | |
|     args.scps = [args.scps]
 | |
| 
 | |
|     # logging info
 | |
|     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
 | |
|     if args.verbose > 0:
 | |
|         logging.basicConfig(level=logging.INFO, format=logfmt)
 | |
|     else:
 | |
|         logging.basicConfig(level=logging.WARN, format=logfmt)
 | |
| 
 | |
|     inputs = {}
 | |
|     assert (len(args.input_scps) == 1)
 | |
|     for f in args.input_scps[0]:
 | |
|         arr = f.strip().split(':')
 | |
|         inputs[arr[0]] = arr[1]
 | |
|     assert ('feat' in inputs)
 | |
|     assert ('shape' in inputs)
 | |
| 
 | |
|     outputs = {}
 | |
|     assert (len(args.output_scps) == 1)
 | |
|     for f in args.output_scps[0]:
 | |
|         arr = f.strip().split(':')
 | |
|         outputs[arr[0]] = arr[1]
 | |
|     assert ('shape' in outputs)
 | |
|     assert ('text' in outputs)
 | |
|     assert ('token' in outputs)
 | |
|     assert ('tokenid' in outputs)
 | |
| 
 | |
|     files = [
 | |
|         inputs['feat'], inputs['shape'], outputs['text'], outputs['token'],
 | |
|         outputs['tokenid'], outputs['shape']
 | |
|     ]
 | |
|     fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape']
 | |
|     fids = [open(f, 'r', encoding='utf-8') for f in files]
 | |
| 
 | |
|     if args.out is None:
 | |
|         out = sys.stdout
 | |
|     else:
 | |
|         out = open(args.out, 'w', encoding='utf-8')
 | |
|     done = False
 | |
|     while not done:
 | |
|         for i, fid in enumerate(fids):
 | |
|             line = fid.readline()
 | |
|             if line == '':
 | |
|                 done = True
 | |
|                 break
 | |
|             arr = line.strip().split()
 | |
|             content = ' '.join(arr[1:])
 | |
|             if i == 0:
 | |
|                 out.write('utt:{}'.format(arr[0]))
 | |
|             out.write('\t')
 | |
|             out.write('{}:{}'.format(fields[i], content))
 | |
|         out.write('\n')
 | |
| 
 | |
|     for f in fids:
 | |
|         f.close()
 | |
|     if args.out is not None:
 | |
|         out.close()
 | 
