I am an absolute newbie to Python. I have 2000+ text files, and each text file contains multiple news articles. Each news article starts with “Dow Jones Newswires DJDN” and ends with “(END) Dow Jones Newswires”.
Here is a piece of code that extracts all the content between each pair of “start” and “end” markers:
# Copy the body of every article in A_2013.txt into one combined output file.
# An article body is everything between a start-marker line and the next
# end-marker line; the marker lines themselves are not copied.
with open('./news_txt/A_2013.txt') as infile, \
        open('./news_txt/A_2013_a.txt', 'w') as outfile:
    inside_article = False
    for raw_line in infile:
        # Markers are compared with surrounding whitespace removed, but the
        # line is written out unmodified.
        marker = raw_line.strip()
        if marker == "Dow Jones Newswires DJDN":
            inside_article = True
        elif marker == "(END) Dow Jones Newswires":
            inside_article = False
        elif inside_article:
            outfile.write(raw_line)
However, this code only works when 1) there is only one txt file, and 2) all the extracted content is stored in a single new txt file.
But what I want is to 1) loop over every txt file in a folder, and 2) save each extracted article in its own new txt file.
For example, if there are 10 news articles in a txt file, after running the code I should get 10 new txt files, each storing one article.
import os
import os.path

folder_path = './news_txt'

# Lines that delimit one article inside an input file. They are compared
# against each input line after stripping surrounding whitespace.
START_MARKER = "Dow Jones Newswires DJDN"
END_MARKER = "(END) Dow Jones Newswires"


def num_to_letters(n):
    """Convert a positive integer to a lowercase letter suffix.

    The mapping is 1 -> 'a', 2 -> 'b', ..., 26 -> 'z', 27 -> 'aa',
    28 -> 'ab', and so on. Values of ``n`` that are zero or negative
    yield an empty string because the loop body never runs.
    """
    rs = []
    while n > 0:
        # Shift to a zero-based value first so that 26 maps to 'z' rather
        # than rolling over to a second letter.
        n -= 1
        n, r = divmod(n, 26)
        rs.insert(0, chr(r + ord('a')))
    return ''.join(rs)


def split_articles(folder=folder_path):
    """Write every article found in the ``.txt`` files of *folder* to its own file.

    For an input file ``NAME.txt`` the articles are written, in order of
    appearance, to ``NAME_a.txt``, ``NAME_b.txt``, ... in the same folder.
    An article is the run of lines between a ``START_MARKER`` line and the
    next ``END_MARKER`` line; the marker lines themselves are not written.
    A start marker seen while an article is already open is skipped, and an
    article that is still open at end of file is kept as-is.

    Existing files with the same output names are overwritten.
    """
    for file_name in os.listdir(folder):
        if not file_name.lower().endswith('.txt'):
            continue
        in_file_path = os.path.join(folder, file_name)
        out_file = None
        num_out_files = 0
        try:
            with open(in_file_path, 'r') as in_file:
                for line in in_file:
                    stripped = line.strip()
                    if stripped == START_MARKER:
                        if out_file is None:
                            num_out_files += 1
                            # file_name ends with a 4-character ".txt"
                            # extension (checked above), so [:-4] is the stem.
                            out_file_name = (
                                file_name[:-4]
                                + '_'
                                + num_to_letters(num_out_files)
                                + '.txt'
                            )
                            out_file_path = os.path.join(folder, out_file_name)
                            out_file = open(out_file_path, 'w')
                    elif stripped == END_MARKER:
                        if out_file is not None:
                            out_file.close()
                            out_file = None
                    elif out_file is not None:
                        out_file.write(line)
        finally:
            # Covers both an article with no end marker and an exception
            # raised while an output file is still open.
            if out_file is not None:
                out_file.close()


if __name__ == "__main__":
    split_articles(folder_path)