501 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			501 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env python3
 | ||
| # -*- coding: utf-8 -*-
 | ||
| 
 | ||
| 
 | ||
| import re, sys, unicodedata
 | ||
| import codecs
 | ||
| 
 | ||
# When True, normalize() strips <...> tags (e.g. <unk>) from tokens;
# overridden from the command line via the --rt= switch.
remove_tag = True
# Whitespace characters skipped by characterize().
spacelist= [' ', '\t', '\r', '\n']
# Punctuation (ASCII and fullwidth/CJK forms) dropped by characterize().
puncts = ['!', ',', '?',
          '、', '。', '!', ',', ';', '?',
          ':', '「', '」', '︰',  '『', '』', '《', '》']
 | ||
| 
 | ||
def characterize(string) :
  """Tokenize *string* for scoring.

  Each letter-other (Lo) character -- e.g. a CJK ideograph -- becomes its
  own token; runs of ASCII become word tokens, with <...> tags such as
  <unk> kept whole.  Punctuation, whitespace and unassigned code points
  are discarded.
  """
  # Constants inlined from the module-level spacelist/puncts definitions
  # (those globals are never modified at runtime).
  whitespace = (' ', '\t', '\r', '\n')
  punctuation = ('!', ',', '?',
                 '、', '。', '!', ',', ';', '?',
                 ':', '「', '」', '︰', '『', '』', '《', '》')
  tokens = []
  pos, size = 0, len(string)
  while pos < size:
    ch = string[pos]
    if ch in punctuation:
      pos += 1
      continue
    # https://unicodebook.readthedocs.io/unicode.html#unicode-categories
    category = unicodedata.category(ch)
    if category in ('Zs', 'Cn') or ch in whitespace:  # space or not assigned
      pos += 1
      continue
    if category == 'Lo':  # letter-other: one token per character
      tokens.append(ch)
      pos += 1
      continue
    # ASCII run: stop at whitespace/non-ASCII, or at '>' when the run is a
    # tag, so input like "<unk><noise>" yields two separate tokens.
    terminator = '>' if ch == '<' else ' '
    end = pos + 1
    while end < size:
      nxt = string[end]
      if ord(nxt) >= 128 or nxt in whitespace or nxt == terminator:
        break
      end += 1
    if end < size and string[end] == '>':
      end += 1  # keep the closing bracket inside the tag token
    tokens.append(string[pos:end])
    pos = end
  return tokens
 | ||
| 
 | ||
def stripoff_tags(x):
  """Return *x* with every <...> tag removed.

  A '<' without a matching '>' swallows the remainder of the string,
  matching the behaviour of a simple left-to-right scanner.
  """
  if not x:
    return ''
  pieces = []
  pos = 0
  while pos < len(x):
    start = x.find('<', pos)
    if start == -1:
      pieces.append(x[pos:])
      break
    pieces.append(x[pos:start])
    close = x.find('>', start)
    if close == -1:
      break  # unclosed tag: drop everything after the '<'
    pos = close + 1
  return ''.join(pieces)
 | ||
| 
 | ||
| 
 | ||
def normalize(sentence, ignore_words, cs, split=None):
    """Normalize a token list (all unicode).

    Upper-cases tokens unless *cs* (case-sensitive) is set, drops tokens
    found in *ignore_words*, strips <...> tags when the module-level
    ``remove_tag`` flag is on, and expands tokens through the optional
    *split* mapping (token -> list of replacement words).
    """
    out = []
    for raw in sentence:
        token = raw if cs else raw.upper()
        if token in ignore_words:
            continue
        if remove_tag:
            token = stripoff_tags(token)
        if not token:
            continue
        if split and token in split:
            out.extend(split[token])
        else:
            out.append(token)
    return out
 | ||
| 
 | ||
class Calculator :
  """Accumulate word-error-rate statistics over a sequence of utterances.

  Attributes:
    data:  per-token counters, {token: {'all','cor','sub','ins','del'}}.
    space: edit-distance DP matrix, grown lazily and reused across calls.
    cost:  edit-operation costs used by calculate().
  """
  def __init__(self) :
    self.data = {}
    self.space = []
    self.cost = {}
    self.cost['cor'] = 0
    self.cost['sub'] = 1
    self.cost['del'] = 1
    self.cost['ins'] = 1
  def calculate(self, lab, rec) :
    """Align reference *lab* with hypothesis *rec* by weighted edit distance.

    Updates the per-token counters in self.data and returns a dict with the
    aligned 'lab'/'rec' token sequences (empty strings pad insertions and
    deletions) plus this utterance's 'all'/'cor'/'sub'/'ins'/'del' counts.
    """
    # Initialization: pad with a leading sentinel token.  Work on copies so
    # the caller's lists are not mutated (the original code did
    # lab.insert(0, '') in place, clobbering the caller's arguments).
    lab = [''] + list(lab)
    rec = [''] + list(rec)
    # Grow the reusable DP matrix to at least len(lab) x len(rec).
    while len(self.space) < len(lab) :
      self.space.append([])
    for row in self.space :
      for element in row :
        element['dist'] = 0
        element['error'] = 'non'
      while len(row) < len(rec) :
        row.append({'dist' : 0, 'error' : 'non'})
    # First column: deleting i reference tokens; first row: inserting j.
    for i in range(len(lab)) :
      self.space[i][0]['dist'] = i
      self.space[i][0]['error'] = 'del'
    for j in range(len(rec)) :
      self.space[0][j]['dist'] = j
      self.space[0][j]['error'] = 'ins'
    self.space[0][0]['error'] = 'non'
    # Ensure every (non-sentinel) token has a counter entry.
    for token in lab :
      if token not in self.data and len(token) > 0 :
        self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0}
    for token in rec :
      if token not in self.data and len(token) > 0 :
        self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0}
    # Computing edit distance
    for i, lab_token in enumerate(lab) :
      for j, rec_token in enumerate(rec) :
        if i == 0 or j == 0 :
          continue
        min_dist = sys.maxsize
        min_error = 'none'
        dist = self.space[i-1][j]['dist'] + self.cost['del']
        error = 'del'
        if dist < min_dist :
          min_dist = dist
          min_error = error
        dist = self.space[i][j-1]['dist'] + self.cost['ins']
        error = 'ins'
        if dist < min_dist :
          min_dist = dist
          min_error = error
        if lab_token == rec_token :
          dist = self.space[i-1][j-1]['dist'] + self.cost['cor']
          error = 'cor'
        else :
          dist = self.space[i-1][j-1]['dist'] + self.cost['sub']
          error = 'sub'
        if dist < min_dist :
          min_dist = dist
          min_error = error
        self.space[i][j]['dist'] = min_dist
        self.space[i][j]['error'] = min_error
    # Tracing back from the bottom-right corner to the sentinel.
    result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
    i = len(lab) - 1
    j = len(rec) - 1
    while True :
      error = self.space[i][j]['error']
      if error == 'cor' : # correct
        if len(lab[i]) > 0 :
          self.data[lab[i]]['all'] += 1
          self.data[lab[i]]['cor'] += 1
          result['all'] += 1
          result['cor'] += 1
        result['lab'].insert(0, lab[i])
        result['rec'].insert(0, rec[j])
        i = i - 1
        j = j - 1
      elif error == 'sub' : # substitution
        if len(lab[i]) > 0 :
          self.data[lab[i]]['all'] += 1
          self.data[lab[i]]['sub'] += 1
          result['all'] += 1
          result['sub'] += 1
        result['lab'].insert(0, lab[i])
        result['rec'].insert(0, rec[j])
        i = i - 1
        j = j - 1
      elif error == 'del' : # deletion
        if len(lab[i]) > 0 :
          self.data[lab[i]]['all'] += 1
          self.data[lab[i]]['del'] += 1
          result['all'] += 1
          result['del'] += 1
        result['lab'].insert(0, lab[i])
        result['rec'].insert(0, "")
        i = i - 1
      elif error == 'ins' : # insertion
        if len(rec[j]) > 0 :
          self.data[rec[j]]['ins'] += 1
          result['ins'] += 1
        result['lab'].insert(0, "")
        result['rec'].insert(0, rec[j])
        j = j - 1
      elif error == 'non' : # starting point reached
        break
      else : # shouldn't reach here
        print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = error))
    return result
  def overall(self) :
    """Return counters summed over every token seen so far."""
    result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
    for token in self.data :
      for key in result :
        result[key] += self.data[token][key]
    return result
  def cluster(self, data) :
    """Return counters summed over the tokens listed in *data* only."""
    result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
    for token in data :
      if token in self.data :
        for key in result :
          result[key] += self.data[token][key]
    return result
  def keys(self) :
    """Return all tokens that have accumulated statistics."""
    return list(self.data.keys())
 | ||
| 
 | ||
def width(string):
  """Display width of *string*: East-Asian wide/fullwidth/ambiguous
  characters count as two columns, everything else as one."""
  total = 0
  for ch in string:
    total += 2 if unicodedata.east_asian_width(ch) in "AFW" else 1
  return total
 | ||
| 
 | ||
def default_cluster(word) :
  """Classify *word* as 'Number', 'Mandarin', 'English', 'Japanese' or 'Other'.

  Each character is labelled from its Unicode name; a small set of in-word
  punctuation (& ' @ ℃ = . - _ # + ;) is ignored.  The word's cluster is
  the shared label of the remaining characters, or 'Other' when characters
  disagree, none remain, or any character falls outside the known groups.
  """
  ignorable = ('AMPERSAND', 'APOSTROPHE', 'COMMERCIAL AT', 'DEGREE CELSIUS',
               'EQUALS SIGN', 'FULL STOP', 'HYPHEN-MINUS', 'LOW LINE',
               'NUMBER SIGN', 'PLUS SIGN', 'SEMICOLON')
  # Resolve every name up front (raises ValueError for unnamed characters,
  # exactly as the original list comprehension did).
  names = [unicodedata.name(char) for char in word]
  labels = []
  for name in names :
    if name.startswith('DIGIT') :  # 1
      labels.append('Number')
    elif name.startswith(('CJK UNIFIED IDEOGRAPH',
                          'CJK COMPATIBILITY IDEOGRAPH')) :
      # 明 / 郎
      labels.append('Mandarin')
    elif name.startswith(('LATIN CAPITAL LETTER', 'LATIN SMALL LETTER')) :
      # A / a
      labels.append('English')
    elif name.startswith('HIRAGANA LETTER') :  # は こ め
      labels.append('Japanese')
    elif name.startswith(ignorable) :
      # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
      continue
    else :
      return 'Other'
  if not labels :
    return 'Other'
  first = labels[0]
  if all(label == first for label in labels) :
    return first
  return 'Other'
 | ||
| 
 | ||
def usage() :
  """Print a short usage summary for the command-line tool."""
  print("compute-wer.py : compute word error rate (WER) and align recognition results and references.")
  # fixed: the summary previously omitted --maxw=, --rt= and --splitfile=,
  # which the argument parser in __main__ actually accepts
  print("         usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] [--maxw=n] [--rt={0,1}] [--splitfile=split_file] test.ref test.hyp > test.wer")
 | ||
| 
 | ||
if __name__ == '__main__':
  if len(sys.argv) == 1 :
    usage()
    sys.exit(0)
  calculator = Calculator()
  cluster_file = ''
  ignore_words = set()
  tochar = False
  verbose = 1
  padding_symbol = ' '
  case_sensitive = False
  max_words_per_line = sys.maxsize
  split = None
  # Consume leading --xxx= switches one at a time; the final two argv
  # entries must remain the reference and hypothesis file paths.
  while len(sys.argv) > 3:
     a = '--maxw='
     if sys.argv[1].startswith(a):
        b = sys.argv[1][len(a):]
        del sys.argv[1]
        max_words_per_line = int(b)
        continue
     a = '--rt='
     if sys.argv[1].startswith(a):
        b = sys.argv[1][len(a):].lower()
        del sys.argv[1]
        # any value except '0' enables tag removal
        remove_tag = (b == 'true') or (b != '0')
        continue
     a = '--cs='
     if sys.argv[1].startswith(a):
        b = sys.argv[1][len(a):].lower()
        del sys.argv[1]
        case_sensitive = (b == 'true') or (b != '0')
        continue
     a = '--cluster='
     if sys.argv[1].startswith(a):
       cluster_file = sys.argv[1][len(a):]
       del sys.argv[1]
       continue
     a = '--splitfile='
     if sys.argv[1].startswith(a):
       split_file = sys.argv[1][len(a):]
       del sys.argv[1]
       split = dict()
       with codecs.open(split_file, 'r', 'utf-8') as fh:
         for line in fh:  # line in unicode
           words = line.strip().split()
           if len(words) >= 2:
             # map a compound token to the list of words it splits into
             split[words[0]] = words[1:]
       continue
     a = '--ig='
     if sys.argv[1].startswith(a):
       ignore_file = sys.argv[1][len(a):]
       del sys.argv[1]
       with codecs.open(ignore_file, 'r', 'utf-8') as fh:
         for line in fh:  # line in unicode
           line = line.strip()
           if len(line) > 0:
             ignore_words.add(line)
       continue
     a = '--char='
     if sys.argv[1].startswith(a):
        b = sys.argv[1][len(a):].lower()
        del sys.argv[1]
        tochar = (b == 'true') or (b != '0')
        continue
     a = '--v='
     if sys.argv[1].startswith(a):
        b = sys.argv[1][len(a):].lower()
        del sys.argv[1]
        verbose = 0
        try:
          verbose = int(b)
        except ValueError:  # non-numeric: any value except '0' turns it on
           if b == 'true' or b != '0':
              verbose = 1
        continue
     a = '--padding-symbol='
     if sys.argv[1].startswith(a):
        b = sys.argv[1][len(a):].lower()
        del sys.argv[1]
        if b == 'space':
          padding_symbol = ' '
        elif b == 'underline':
          padding_symbol = '_'
        continue
     # unrecognized switch: drop it silently
     del sys.argv[1]

  if not case_sensitive:
     # fold the ignore list to upper case to match normalize()'s folding
     ignore_words = set([w.upper() for w in ignore_words])

  default_clusters = {}   # cluster name -> {token: 1}
  default_words = {}      # token -> cluster name

  ref_file = sys.argv[1]
  hyp_file = sys.argv[2]
  rec_set = {}            # utterance id -> normalized hypothesis tokens
  if split and not case_sensitive:
     # fold the split table to upper case as well
     newsplit = dict()
     for w in split:
        words = split[w]
        for i in range(len(words)):
           words[i] = words[i].upper()
        newsplit[w.upper()] = words
     split = newsplit

  with codecs.open(hyp_file, 'r', 'utf-8') as fh:
     for line in fh:
        if tochar:
            array = characterize(line)
        else:
            array = line.strip().split()
        if len(array) == 0: continue
        fid = array[0]
        rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)

  # compute error rate on the intersection of reference file and hyp file
  with open(ref_file, 'r', encoding='utf-8') as fh:
    for line in fh :
      if tochar:
        array = characterize(line)
      else:
        array = line.rstrip('\n').split()
      if len(array) == 0: continue
      fid = array[0]
      if fid not in rec_set:
         continue
      lab = normalize(array[1:], ignore_words, case_sensitive, split)
      rec = rec_set[fid]
      if verbose:
        print('\nutt: %s' % fid)

      # classify every newly-seen token into a default cluster (script class)
      for word in rec + lab :
        if word not in default_words :
           default_cluster_name = default_cluster(word)
           if default_cluster_name not in default_clusters :
             default_clusters[default_cluster_name] = {}
           if word not in default_clusters[default_cluster_name] :
             default_clusters[default_cluster_name][word] = 1
           default_words[word] = default_cluster_name

      result = calculator.calculate(lab, rec)
      if verbose:
        if result['all'] != 0 :
          wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
        else :
          wer = 0.0
        print('WER: %4.2f %%' % wer, end = ' ')
        print('N=%d C=%d S=%d D=%d I=%d' %
            (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
        # per-column padding so aligned lab/rec tokens print the same width
        space = {}
        space['lab'] = []
        space['rec'] = []
        for idx in range(len(result['lab'])) :
          len_lab = width(result['lab'][idx])
          len_rec = width(result['rec'][idx])
          length = max(len_lab, len_rec)
          space['lab'].append(length - len_lab)
          space['rec'].append(length - len_rec)
        upper_lab = len(result['lab'])
        upper_rec = len(result['rec'])
        lab1, rec1 = 0, 0
        while lab1 < upper_lab or rec1 < upper_rec:
           if verbose > 1:
               print('lab(%s):' % fid.encode('utf-8'), end = ' ')
           else:
               print('lab:', end = ' ')
           lab2 = min(upper_lab, lab1 + max_words_per_line)
           for idx in range(lab1, lab2):
             token = result['lab'][idx]
             print('{token}'.format(token = token), end = '')
             for n in range(space['lab'][idx]) :
               print(padding_symbol, end = '')
             print(' ', end = '')
           print()
           if verbose > 1:
              print('rec(%s):' % fid.encode('utf-8'), end = ' ')
           else:
              print('rec:', end = ' ')
           rec2 = min(upper_rec, rec1 + max_words_per_line)
           for idx in range(rec1, rec2):
             token = result['rec'][idx]
             print('{token}'.format(token = token), end = '')
             for n in range(space['rec'][idx]) :
               print(padding_symbol, end = '')
             print(' ', end = '')
           print('\n', end = '\n')
           lab1 = lab2
           rec1 = rec2

  if verbose:
    print('===========================================================================')
    print()

  result = calculator.overall()
  if result['all'] != 0 :
    wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
  else :
    wer = 0.0
  print('Overall -> %4.2f %%' % wer, end = ' ')
  print('N=%d C=%d S=%d D=%d I=%d' %
        (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
  if not verbose:
     print()

  if verbose:
    # per-script-class WERs (Mandarin / English / Number / ...)
    for cluster_id in default_clusters :
      result = calculator.cluster([ k for k in default_clusters[cluster_id] ])
      if result['all'] != 0 :
         wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
      else :
         wer = 0.0
      print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ')
      print('N=%d C=%d S=%d D=%d I=%d' %
           (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
    if len(cluster_file) > 0 : # compute separated WERs for word clusters
      cluster_id = ''
      cluster = []
      with open(cluster_file, 'r', encoding='utf-8') as fh:
        # fixed: the original called line.decode('utf-8') on an already
        # decoded str, which raises AttributeError on Python 3
        for line in fh :
          for token in line.rstrip('\n').split() :
            # end of cluster reached, like </Keyword>
            if token[0:2] == '</' and token[len(token)-1] == '>' and \
               token.lstrip('</').rstrip('>') == cluster_id :
              result = calculator.cluster(cluster)
              if result['all'] != 0 :
                wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
              else :
                wer = 0.0
              print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ')
              # fixed: the format string previously read 'D=d' (missing '%'),
              # which raises TypeError with five arguments supplied
              print('N=%d C=%d S=%d D=%d I=%d' %
                    (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
              cluster_id = ''
              cluster = []
            # begin of cluster reached, like <Keyword>
            elif token[0] == '<' and token[len(token)-1] == '>' and \
                 cluster_id == '' :
              cluster_id = token.lstrip('<').rstrip('>')
              cluster = []
            # general terms, like WEATHER / CAR / ...
            else :
              cluster.append(token)
    print()
    print('===========================================================================')
 | 
