# Source listing: b2txt25/language_model/utils/convert_number_to_words.py
# 2025-07-02 12:18:09 -07:00
#
# 114 lines
# 3.6 KiB
# Python

import re
import time
from tqdm import tqdm
from num2words import num2words
from multiprocessing import Pool, Queue
def buf_count_newlines_gen(fname):
    """Return the number of newline bytes (``b"\\n"``) in the file *fname*.

    The file is opened in binary mode and read through its raw (unbuffered)
    stream in 64 KiB chunks, so the whole file is never held in memory.
    A final line without a trailing newline is therefore not counted.
    """
    chunk_size = 2 ** 16
    newline_total = 0
    with open(fname, "rb") as handle:
        read_raw = handle.raw.read
        chunk = read_raw(chunk_size)
        # An empty (falsy) chunk marks end of file.
        while chunk:
            newline_total += chunk.count(b"\n")
            chunk = read_raw(chunk_size)
    return newline_total
def number_to_words(line: str):
    """Replace every number in *line* with its spelled-out English form.

    A "number" is an optional ``$``, one or more digits, any mix of further
    digits and dots, and an optional ``%``. Each match is converted with
    ``num2words`` and padded with spaces; afterwards every whitespace run in
    the line is collapsed to a single space and the result is stripped.

    Args:
        line: The text to convert.

    Returns:
        The converted text. A line without digits is returned with only its
        whitespace normalised.

    Raises:
        Any exception ``num2words`` raises for a value it cannot parse
        (for example a percentage such as ``3.1.1%``) propagates to the caller.
    """
    def transform(match):
        number_str = match.group()
        # Collapse runs of dots ("3..5" -> "3.5") and drop a trailing dot.
        number_str = re.sub(r'\.+', '.', number_str)
        if number_str.endswith('.'):
            number_str = number_str[:-1]

        if '%' in number_str:
            # Percentage: strip the '$'/'%' signs (and a dot directly before
            # them) and append the word "percent".
            new_number_str = num2words(re.sub(r'\.?[$%]+', '', number_str)) + ' percent'
        elif re.match(r'\d{4}\b', number_str) and 1800 <= int(re.sub(r'\D+', '', number_str)) <= 2100:
            # Exactly four leading digits whose value (all digits of the match
            # joined) lies in 1800..2100: read it as a year.
            new_number_str = num2words(re.sub(r'\D+', '', number_str), to='year')
        elif number_str.count('.') > 0:
            # Any remaining dot (e.g. 3.14 or 3.1.1): every dot-separated part
            # is spelled as its own integer and the parts are joined by "point".
            # NOTE(review): this reads 3.14 as "three point fourteen". The
            # fallback branch below keeps '.', which suggests `> 1` may have
            # been intended here -- confirm before changing.
            new_number_str = ' point '.join(
                [num2words(re.sub(r'\D+', '', num)) for num in number_str.split('.')]
            )
        else:
            # Plain number, possibly with a leading '$' (replaced by a space).
            new_number_str = num2words(re.sub(r'[^\d.]+', ' ', number_str))
        return ' ' + new_number_str + ' '

    new_line = re.sub(r'\$?[\d]+[\d\.]*%?', transform, line)
    new_line = re.sub(r'\s+', ' ', new_line)
    return new_line.strip()
def process_line(input_queue: Queue, output_queue: Queue, error_queue: Queue):
    """Worker loop: normalise lines from *input_queue* forever.

    For every line taken from *input_queue* exactly one item is produced:
    the cleaned line on *output_queue*, or the (stripped) original line on
    *error_queue* when it contains ``...`` or when conversion raises.

    Args:
        input_queue: Source of raw text lines.
        output_queue: Receives successfully converted lines.
        error_queue: Receives lines that were skipped or failed to convert.

    This function never returns; the owning process has to be terminated
    from outside.
    """
    while True:
        line = input_queue.get(True)
        # Strip surrounding whitespace, then drop the final character.
        # NOTE(review): presumably this removes sentence-final punctuation; a
        # line that does not end in punctuation loses a real character --
        # confirm against the corpus format.
        line = line.strip()[:-1]
        if '...' in line:
            error_queue.put(line)
            continue
        try:
            # Replace numbers with words
            new_line = number_to_words(line)
            # Keep only ASCII letters, digits, apostrophes and spaces. The
            # range must be A-Z: "A-z" would also let [ \ ] ^ _ ` through.
            # NOTE(review): hyphens and commas produced by num2words (e.g.
            # "twenty-one") are removed here as well -- confirm that is wanted.
            new_line = re.sub(r"[^a-zA-Z0-9' ]", '', new_line)
            new_line = re.sub(r'\s+', ' ', new_line)
            output_queue.put(new_line)
        except Exception:
            # Conversion failures are recorded, not fatal. KeyboardInterrupt
            # and SystemExit are deliberately not caught so the worker can
            # still be stopped.
            error_queue.put(line)
def write_queue_to_file(queue: Queue, file: str, wait_for_queue: Queue, drain_timeout: float = 1.0):
    """Append every item of *queue* to *file*, one item per line.

    Args:
        queue: Queue of strings to write; it is drained by this call.
        file: Path of the file to append to.
        wait_for_queue: Writing starts only once this queue reports empty
            (polled once per second).
        drain_timeout: Seconds to wait for a further item before *queue* is
            considered drained.

    ``queue.empty()`` is not used to end the drain: for a multiprocessing
    queue it can report True while items put by other processes are still in
    transit, which would silently drop lines. Waiting on ``get`` with a
    timeout tolerates such short gaps.
    """
    # The import statement resolves the stdlib module by name, so the
    # parameter called `queue` does not interfere with it.
    from queue import Empty

    while not wait_for_queue.empty():
        time.sleep(1)
    with open(file, 'a') as f:
        while True:
            try:
                item = queue.get(timeout=drain_timeout)
            except Empty:
                break
            f.write(item + '\n')
if __name__ == '__main__':
    # Script entry point: read SOURCE_FILE line by line, let a pool of worker
    # processes spell out the numbers, then write converted lines to
    # OUTPUT_FILE and rejected lines to ERROR_FILE.
    SOURCE_FILE = 'financial-reports-sec.txt'
    OUTPUT_FILE = 'financial-reports-sec_processed.txt'
    ERROR_FILE = 'financial-reports-sec_error.txt'
    # Give up waiting after this many seconds without any new result while
    # the input queue is already empty (e.g. a worker process died).
    STALL_LIMIT_SECONDS = 60

    total_line_count = buf_count_newlines_gen(SOURCE_FILE)
    input_queue = Queue()
    output_queue = Queue()
    error_queue = Queue()
    # process_line is passed as the pool *initializer*: each of the 20 workers
    # enters its endless loop immediately and never serves pool tasks.
    pool = Pool(20, process_line, (input_queue, output_queue, error_queue,))
    start_time = time.time()
    try:
        # Clear files
        open(OUTPUT_FILE, 'w').close()
        open(ERROR_FILE, 'w').close()

        # Feed every line to the workers, counting what was actually sent so
        # completion can be detected exactly (the newline count is only used
        # as the progress-bar estimate).
        lines_enqueued = 0
        with open(SOURCE_FILE, 'r') as fp:
            for line in tqdm(fp, total=total_line_count):
                input_queue.put(line)
                lines_enqueued += 1
        print('Finished reading file, processing...')

        # Each input line yields exactly one item on the output or the error
        # queue, so wait for that many results. An empty input queue alone
        # does not mean the workers have finished their last lines.
        # NOTE: Queue.qsize() raises NotImplementedError on platforms such as
        # macOS.
        pb = tqdm(total=lines_enqueued)
        stalled_seconds = 0
        while True:
            processed = output_queue.qsize() + error_queue.qsize()
            if processed > pb.n:
                pb.update(processed - pb.n)
                stalled_seconds = 0
            elif input_queue.empty():
                stalled_seconds += 1
            if processed >= lines_enqueued:
                break
            if stalled_seconds >= STALL_LIMIT_SECONDS:
                print('No progress for %d seconds; continuing with %d of %d lines.'
                      % (STALL_LIMIT_SECONDS, processed, lines_enqueued))
                break
            time.sleep(1)
        pb.close()

        print('Finished processing file, writing to file...')
        write_queue_to_file(output_queue, OUTPUT_FILE, input_queue)
        write_queue_to_file(error_queue, ERROR_FILE, input_queue)
        print('Done in %.1f seconds.' % (time.time() - start_time))
    finally:
        # The workers never return on their own, so they must be terminated;
        # close() alone would leave them blocked on the input queue.
        pool.terminate()
        pool.join()