My code gets the job done, but it is ugly, too long, and clumsy. I have to work through several thousand files that fall into four groups, and I only want one specific type.
I want: ‘.docx’
I do not want: ‘.pdf’, ‘SS.docx’, or ‘ss.docx’
I tried several `if not` conditions,
but they did not really work. In the end I built lists of all file types and then anti-joined them against the complete list one after another, so that only the files I am interested in remain.
Question:
Is it possible to simplify my if/elif block? Could this be done in fewer lines, to get directly to only the files I need?
Is it possible to pack the DataFrame generation into a loop instead of having to do it manually for each list?
# List all dirs and files under a given directory tree, sort the files into
# groups, then anti-join the unwanted groups away from the complete file list.
import os
import pandas as pd
import glob
import docx
from docx.api import Document

# Fixed variable.
# Raw string on purpose: in a normal literal "\N" starts a named-unicode
# escape, so 'C:\Data_analysis\N_TRACKING' is a SyntaxError in Python 3.
location = r'C:\Data_analysis\N_TRACKING'

# All lists
dirs_in_dir = []
SS_files_in_dir = []
ss_files_in_dir = []
pdfs_in_dir = []
targets_in_dir = []
all_files = []

# Active mapping of the directory tree and the files in it, done in a single
# walk instead of three identical ones.
# r=>root, d=>directories, f=>files
for r, d, f in os.walk(location):
    dirs_in_dir.extend(os.path.join(r, item) for item in d)
    for item in f:
        path = os.path.join(r, item)
        all_files.append(path)
        # Substring tests, first match wins: pdf, then 'SS', then 'ss',
        # and only what is left over can become a .docx target.
        if '.pdf' in item:
            pdfs_in_dir.append(path)
        elif 'SS' in item:
            SS_files_in_dir.append(path)
        elif 'ss' in item:
            ss_files_in_dir.append(path)
        elif '.docx' in item:
            targets_in_dir.append(path)


def _files_frame(paths):
    """Return a one-column ('Files') DataFrame holding *paths*."""
    # Naming the column at construction time also works for an empty list;
    # assigning .columns afterwards raises a length-mismatch ValueError then.
    return pd.DataFrame(paths, columns=['Files'])


def _anti_join(left, right):
    """Return the rows of *left* whose 'Files' value is absent from *right*."""
    merged = left.merge(right, on='Files', how='left', indicator=True)
    return merged[merged['_merge'] != 'both'].drop(columns='_merge')


# Antijoin, step 1: create the DataFrames
SS_files_df = _files_frame(SS_files_in_dir)
ss_files_df = _files_frame(ss_files_in_dir)
pdfs_df = _files_frame(pdfs_in_dir)
all_files_df = _files_frame(all_files)

# Antijoin, step 2: subtract the other DataFrames from all_files_df,
# one after another.
no_pdfs = _anti_join(all_files_df, pdfs_df)   # remove pdfs
no_ss = _anti_join(no_pdfs, ss_files_df)      # remove ss files
no_SS = _anti_join(no_ss, SS_files_df)        # remove SS files

# NOTE: no_SS keeps every file that is not in the pdf/ss/SS groups, including
# files that are not .docx at all; targets_in_dir is the list that holds only
# the .docx files left after the pdf/SS/ss tests.
Answer
Since you:
- Only want ‘.docx’ (i.e. as determined by suffix)
- Do not want: ‘.pdf’, ‘SS.docx’, or ‘ss.docx’ (i.e. files with these endings)
This could be done more simply as follows.
Code–Option 1 using str endswith
import os


def find_docx(location):
    """Return the paths of the wanted ``.docx`` files below *location*.

    The directory tree rooted at *location* is walked top-down. A file is
    kept when its name ends in ``.docx`` but not in ``ss.docx``; the
    comparison ignores case, so ``SS.docx`` and ``.DOCX`` are covered too.
    Names ending in ``.pdf`` are skipped explicitly.

    Args:
        location: Root directory to search.

    Returns:
        A list of ``os.path.join(root, name)`` paths that keep the file
        name exactly as it is spelled on disk. The list is empty when
        nothing matches or when *location* cannot be walked.
    """
    all_files = []  # Will contain found files

    # Walk top-down through directory tree
    for root, dirs, files in os.walk(location, topdown=True):
        for f in files:
            # Lowercase only a copy for the comparison: the returned path
            # must keep the real spelling, otherwise it points to a file
            # that does not exist on case-sensitive filesystems.
            lowered = f.lower()

            if lowered.endswith(('.pdf', 'ss.docx')):
                continue  # Skipping pdf, ss.docx and SS.docx

            if lowered.endswith('.docx'):
                # Desired docx (undesired docx was filtered out above)
                all_files.append(os.path.join(root, f))

    return all_files
Code–Option 2 using Regular Expression
import os
import re

# Compiled once at import time instead of on every call.
# The dots are escaped so that only a real ".docx" / ".pdf" extension matches
# (an unescaped "." would also accept names such as "mydocx"), and \Z anchors
# at the very end of the name.
_DESIRED_RE = re.compile(r"\.docx\Z", re.IGNORECASE)               # .docx suffix
_UNDESIRED_RE = re.compile(r"(\.pdf|ss\.docx)\Z", re.IGNORECASE)   # pdf and ss.docx suffix


def find_docx(location):
    """Return the paths of the wanted ``.docx`` files below *location*.

    The directory tree rooted at *location* is walked top-down. A file is
    kept when its name ends in ``.docx`` and does not end in ``.pdf`` or
    ``ss.docx``; both tests ignore case.

    Args:
        location: Root directory to search.

    Returns:
        A list of ``os.path.join(root, name)`` paths for the matching
        files. The list is empty when nothing matches or when *location*
        cannot be walked.
    """
    all_files = []  # Will contain found files

    # Walk top-down through directory tree
    for root, dirs, files in os.walk(location, topdown=True):
        for f in files:
            # Matches desired and doesn't match undesired
            if _DESIRED_RE.search(f) and not _UNDESIRED_RE.search(f):
                all_files.append(os.path.join(root, f))

    return all_files
Usage
Using either of the above find_docx functions.
# Raw string keeps the Windows backslashes literal ("\N" would otherwise be
# read as the start of a named-unicode escape).
location = r'C:\Data_analysis\N_TRACKING'

# Collect every wanted .docx path below the directory.
all_files = find_docx(location)