def _split_chunks(text, limit=200000):
    """Yield `text` in pieces of at most `limit` characters.

    Each split point is the last newline before `limit`; if the backward scan
    reaches limit//2 without finding a newline, the split instead falls just
    past the first whitespace-delimited token at the midpoint, so a chunk is
    never degenerately small. The final (possibly short) remainder is always
    yielded, even when `text` is empty.
    """
    while len(text) > limit:
        i = limit - 1
        while text[i] != '\n':
            i -= 1
            if i == limit // 2:
                # No newline in the back half of the window -- advance past
                # the next whitespace-separated token instead.
                i = i + len(text[i:limit].split()[0])
                break
        yield text[:i]
        text = text[i:]
    yield text


def filing_iterator(path_to_10K_folder):
    """Yield the text of every file under `path_to_10K_folder`, split into
    chunks of at most 200,000 characters (see `_split_chunks`)."""
    for file in sorted(glob.glob(path_to_10K_folder + '*')):
        # Context manager: the original leaked the file handle.
        with open(file, 'rt') as fh:
            text = fh.read()
        yield from _split_chunks(text)


def name_iterator(path_to_10K_folder):
    """Yield each file's basename once per chunk that `filing_iterator`
    produces for it, so the two iterators stay aligned 1:1."""
    for file in sorted(glob.glob(path_to_10K_folder + '*')):
        with open(file, 'rt') as fh:
            text = fh.read()
        name = file.split('/')[-1]
        for _chunk in _split_chunks(text):
            yield name


def reassemble_docs(docs, names):
    """Regroup chunked docs: concatenate every docs[i] that shares a name.

    Returns (compiled_docs, doc_names); output order follows the first
    appearance of each name in `names`.
    """
    # Determine which docs fit together (dict preserves insertion order).
    name_ids = {}
    for idx, name in enumerate(names):  # `idx`, not `id`: avoid shadowing the builtin
        name_ids.setdefault(name, []).append(idx)
    doc_names = []
    compiled_docs = []
    for name, ids in name_ids.items():
        doc_names.append(name)
        doc = []
        for idx in ids:
            doc += docs[idx]
        compiled_docs.append(doc)
    return (compiled_docs, doc_names)


# This lemmatizes words, drops stopwords, drops symbols, drops numbers, and lowercases words
def cleaner(doc):
    """Return lowercased lemmas of `doc`, skipping stopwords and tokens
    tagged SYM/NUM/SPACE/PUNCT/X."""
    return [token.lemma_.lower() for token in doc
            if (not token.is_stop)
            and (token.pos_ not in ['SYM', 'NUM', 'SPACE', 'PUNCT', 'X'])]


# Define our pipe.
# We keep the lemmatizer to lemmatize with, and the tagger because it is a
# dependency of the lemmatizer.
nlp = spacy.load("en_core_web_sm", exclude=['ner', 'parser'])
# filing_iterator() caps every chunk at 200,000 characters, so max_length only
# needs to cover one chunk. (spaCy needs up to ~1GB of RAM per 1M characters.)
nlp.max_length = 200000

total_docs = len(list(filing_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/')))
doc_names = list(name_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/'))
%%time # 27.5 minutes to run and as much as 500MB*n_process to process; most RAM usage released on completion # Benchmark time based on running on an AMD Ryzen 5900X CPU docs = [] for doc in tqdm(nlp.pipe(filing_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/'), batch_size=1, n_process=20), total=total_docs): docs.append(cleaner(doc))# Fix docs -- they were split into multiple list when greater than 200,000 characters in length docs, doc_names = reassemble_docs(docs, doc_names)doc_names = [doc.replace('2014\\', '') for doc in doc_names]# Save to a file with open('M:/Scratch/docs.pkl', 'wb') as f: pickle.dump([docs, doc_names], f, protocol=pickle.HIGHEST_PROTOCOL)with open('M:/Scratch/docs.pkl', 'rb') as f: docs, _ = pickle.load(f)