114 lines
3.6 KiB
Python
114 lines
3.6 KiB
Python
import re
|
|
import time
|
|
|
|
from tqdm import tqdm
|
|
from num2words import num2words
|
|
|
|
from multiprocessing import Pool, Queue
|
|
|
|
def buf_count_newlines_gen(fname):
    """Return the number of newline bytes (``b"\\n"``) contained in ``fname``.

    The file is opened in binary mode and read through its raw (unbuffered)
    stream in chunks of 2 ** 16 bytes, so the whole file is never held in
    memory at once. A final line without a trailing newline is not counted.
    """
    chunk_size = 2 ** 16
    newline_total = 0
    with open(fname, "rb") as handle:
        read_chunk = handle.raw.read
        while True:
            chunk = read_chunk(chunk_size)
            # An empty (falsy) read marks the end of the file.
            if not chunk:
                break
            newline_total += chunk.count(b"\n")
    return newline_total
|
|
|
|
def number_to_words(line:str):
    """Replace every number in ``line`` with its spelled-out form.

    A "number" is a run of digits and dots, optionally prefixed with ``$``
    and/or suffixed with ``%``. Each match is converted with ``num2words``:

    * percentages  -> "<number> percent"
    * a bare four-digit value between 1800 and 2100 -> year reading
    * dotted identifiers with more than one dot (e.g. ``3.1.1``) -> each
      part spelled out and joined with "point"
    * everything else (integers and ordinary decimals) -> cardinal reading

    Returns the line with runs of whitespace collapsed to single spaces and
    leading/trailing whitespace removed. Exceptions raised by ``num2words``
    (e.g. for input it cannot parse) propagate to the caller.
    """
    def transform(match):
        number_str = match.group()

        # Collapse runs of dots ("1..2" -> "1.2") and drop a trailing dot,
        # which is sentence punctuation rather than part of the number.
        number_str = re.sub(r'\.+', '.', number_str)
        if number_str.endswith('.'):
            number_str = number_str[:-1]

        # check if number is a percentage
        if '%' in number_str:
            new_number_str = num2words(re.sub(r'\.?[$%\b]+', '', number_str)) + ' percent'
        # check if number is a valid year
        elif re.match(r'\b^\d{4}\b', number_str) and 1800 <= int(re.sub(r'[^\d]+', '', number_str)) <= 2100:
            new_number_str = num2words(re.sub(r'[^\d]+', '', number_str), to='year')
        # check if multiple '.' in number (e.g. 3.1.1)
        # Must be "> 1": a single dot is an ordinary decimal, and splitting
        # it into integers would turn "0.05" into "zero point five".
        elif number_str.count('.') > 1:
            new_number_str = ' point '.join([num2words(re.sub(r'[^\d]+', '', num)) for num in number_str.split('.')])
        else:
            # Keeps digits and the decimal point; anything else (e.g. "$")
            # becomes whitespace around the number.
            new_number_str = num2words(re.sub(r'[^\d.]+', ' ', number_str))
        # Pad with spaces so the words never fuse with neighbouring text;
        # the surplus whitespace is collapsed below.
        return ' ' + new_number_str + ' '

    new_line = re.sub(r'\$?[\d]+[\d\.]*%?', transform, line)
    new_line = re.sub(r'\s+', ' ', new_line)
    return new_line.strip()
|
|
|
|
def process_line(input_queue:Queue, output_queue:Queue, error_queue:Queue):
    """Worker loop: normalise lines from ``input_queue`` forever.

    For every line taken from ``input_queue`` exactly one item is produced:
    the cleaned line on ``output_queue``, or the (stripped) original line on
    ``error_queue`` when it contains an ellipsis or conversion fails.

    The function never returns; it blocks on ``input_queue.get`` when no
    work is available.
    """
    while True:
        line = input_queue.get(True)

        # Removes surrounding whitespace and then the last remaining
        # character. NOTE(review): presumably a sentence-final period;
        # confirm against the input data.
        line = line.strip()[:-1]
        if '...' in line:
            # Lines with an ellipsis are not converted; route them to errors.
            error_queue.put(line)
            continue

        try:
            # Replace numbers with words
            new_line = number_to_words(line)
            # Keep only ASCII letters, digits, apostrophes and spaces.
            # The range must be A-Z: "A-z" would also keep [ \ ] ^ _ and `.
            new_line = re.sub(r'[^a-zA-Z0-9\' ]', '', new_line)
            new_line = re.sub(r'\s+', ' ', new_line)
        except Exception:
            # Conversion failed; a bare "except:" would also trap
            # KeyboardInterrupt/SystemExit and make the worker unkillable.
            error_queue.put(line)
        else:
            output_queue.put(new_line)
|
|
|
|
def write_queue_to_file(queue:Queue, file:str, wait_for_queue:Queue, drain_timeout:float=1.0):
    """Append every item of ``queue`` to ``file``, one item per line.

    Args:
        queue: queue of strings to drain.
        file: path of the file to append to.
        wait_for_queue: writing starts only once this queue reports empty
            (polled once per second).
        drain_timeout: seconds to wait for a further item before ``queue``
            is considered exhausted.

    Items are fetched with a blocking ``get(timeout=...)`` instead of being
    guarded by ``queue.empty()``: for a multiprocessing queue ``empty()`` can
    report True while items put by other processes are still in transit,
    which would silently drop lines.
    """
    # Local import: the parameter named ``queue`` shadows the stdlib module
    # name inside this function, so only the exception class is pulled in.
    from queue import Empty

    while not wait_for_queue.empty():
        time.sleep(1)
    with open(file, 'a') as f:
        while True:
            try:
                item = queue.get(timeout=drain_timeout)
            except Empty:
                # Nothing arrived within the grace period: queue is drained.
                break
            f.write(item + '\n')
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: stream SOURCE_FILE through a pool of worker
    # processes and write converted lines to OUTPUT_FILE and rejected lines
    # to ERROR_FILE.
    SOURCE_FILE = 'financial-reports-sec.txt'
    OUTPUT_FILE = 'financial-reports-sec_processed.txt'
    ERROR_FILE = 'financial-reports-sec_error.txt'

    # Seconds without any new result (while the input queue is already
    # empty) after which waiting for the workers is abandoned.
    STALL_LIMIT_SECONDS = 60

    # Newline count is used only to size the reading progress bar.
    total_line_count = buf_count_newlines_gen(SOURCE_FILE)

    input_queue = Queue()
    output_queue = Queue()
    error_queue = Queue()
    # process_line is passed as the pool *initializer*: each of the 20
    # workers enters its endless consume loop as soon as it starts.
    pool = Pool(20, process_line, (input_queue, output_queue, error_queue,))

    # Clear files
    open(OUTPUT_FILE, 'w').close()
    open(ERROR_FILE, 'w').close()

    # Feed every line to the workers, counting what was actually sent so the
    # next phase knows how many results to expect.
    lines_sent = 0
    pb = tqdm(total=total_line_count)
    with open(SOURCE_FILE, 'r') as fp:
        for line in fp:
            input_queue.put(line)
            lines_sent += 1
            pb.update(1)
    pb.close()

    print('Finished reading file, processing...')
    # process_line puts exactly one item (output or error) per input line,
    # so processing is complete when the result count reaches lines_sent.
    # An empty input queue alone is not enough: workers may still be busy
    # with the lines they fetched last.
    pb = tqdm(total=lines_sent)
    stalled_seconds = 0
    while True:
        lines_done = output_queue.qsize() + error_queue.qsize()
        if lines_done > pb.n:
            pb.update(lines_done - pb.n)
            stalled_seconds = 0
        if lines_done >= lines_sent:
            break
        if input_queue.empty():
            stalled_seconds += 1
            if stalled_seconds >= STALL_LIMIT_SECONDS:
                # Guard against waiting forever if a worker died mid-line.
                print('No progress for %d seconds, writing what is available...' % STALL_LIMIT_SECONDS)
                break
        time.sleep(1)
    pb.close()

    print('Finished processing file, writing to file...')
    write_queue_to_file(output_queue, OUTPUT_FILE, input_queue)
    write_queue_to_file(error_queue, ERROR_FILE, input_queue)

    # The workers loop forever inside the initializer, so close() would
    # leave them running; stop them explicitly and reap the processes.
    pool.terminate()
    pool.join()