my code is ugly: extracting only the files I want from a list of files

My code gets the job done, but it is ugly, too long, and clumsy. I have to work through several thousand files, which fall into four groups, and I only want one specific type.

I want: ‘.docx’

I do not want: ‘.pdf’, ‘SS.docx’, or ‘ss.docx’

I tried several `if not` conditions, but they did not really work. In the end I built lists of all file types and then anti-joined them against the complete list, one after another, so that only the files I am interested in remain.

Question:

Is it possible to simplify my if/elif block? Could this be done with fewer lines, to get directly to only the files I need?

Is it possible to pack the DataFrame generation into a loop instead of having to do it manually for each list?

#List all dirs under given dirs and subdirs
import os
import pandas as pd
import glob
import docx
from docx.api import Document

# Fixed variable: root directory that is scanned below.
# Raw string so the Windows backslashes stay literal; in a normal string
# literal '\N' starts a \N{name} unicode escape and is a SyntaxError here.
location = r'C:\Data_analysis\N_TRACKING'

# All result lists; every entry is a full path (walk root joined with name).
dirs_in_dir = []
SS_files_in_dir = []
ss_files_in_dir = []
pdfs_in_dir = []
targets_in_dir = []
all_files = []

# Map the directory tree below `location` in a single pass:
# every sub-directory goes to dirs_in_dir, every file to all_files.
# (No per-item filter is needed: the former `'' in item` test was always true.)
for root, subdirs, filenames in os.walk(location):
    dirs_in_dir.extend(os.path.join(root, name) for name in subdirs)
    all_files.extend(os.path.join(root, name) for name in filenames)

# Classify every file below `location` by the ENDING of its name:
#   *.pdf      -> pdfs_in_dir
#   *SS.docx   -> SS_files_in_dir
#   *ss.docx   -> ss_files_in_dir
#   other *.docx -> targets_in_dir (the files that are actually wanted)
# Suffix tests are used instead of substring tests so that a name such as
# 'assessment.docx' (contains 'ss', but does not end in 'ss.docx') is kept.
# The order of the branches matters: the two 'ss' endings must be tested
# before the generic '.docx' ending.
for root, _subdirs, filenames in os.walk(location):
    for filename in filenames:
        full_path = os.path.join(root, filename)
        if filename.endswith('.pdf'):
            pdfs_in_dir.append(full_path)
        elif filename.endswith('SS.docx'):
            SS_files_in_dir.append(full_path)
        elif filename.endswith('ss.docx'):
            ss_files_in_dir.append(full_path)
        elif filename.endswith('.docx'):
            targets_in_dir.append(full_path)

def _anti_join(left, right):
    """Return the rows of *left* whose 'Files' value does not occur in *right*.

    A left merge with ``indicator=True`` marks each row as 'left_only' or
    'both'; only the 'left_only' rows are kept and the helper '_merge'
    column is removed again.
    """
    merged = left.merge(right, on='Files', how='left', indicator=True)
    return merged[merged['_merge'] == 'left_only'].drop(columns='_merge')


# Anti-join, step 1: one single-column DataFrame per path list.
# Passing `columns=` to the constructor (instead of assigning `.columns`
# afterwards) also works when a list is empty.
SS_files_df = pd.DataFrame(SS_files_in_dir, columns=['Files'])
ss_files_df = pd.DataFrame(ss_files_in_dir, columns=['Files'])
pdfs_df = pd.DataFrame(pdfs_in_dir, columns=['Files'])
all_files_df = pd.DataFrame(all_files, columns=['Files'])

# Anti-join, step 2: subtract the unwanted frames from all_files_df,
# one after another (pdf files, then 'ss' files, then 'SS' files).
no_pdfs = _anti_join(all_files_df, pdfs_df)
no_ss = _anti_join(no_pdfs, ss_files_df)
no_SS = _anti_join(no_ss, SS_files_df)

Answer

Since you:

  • Only want ‘.docx’ (i.e. as determined by suffix)
  • Do not want: ‘.pdf’, ‘SS.docx’, or ‘ss.docx’ (i.e. files with these endings)

This could be done more simply as follows.

Code–Option 1 using str endswith

import os

def find_docx(location):
    """Return the paths of all wanted .docx files below *location*.

    The directory tree is walked top-down. A file is wanted when its name
    ends in '.docx' but not in 'ss.docx'; both tests ignore case, so
    'SS.docx' endings are skipped as well. Files with any other ending
    (e.g. '.pdf') are never collected.

    Args:
        location: Root directory to search.

    Returns:
        A list of paths (walk root joined with the file name as it is
        spelled on disk), in the order os.walk reports them.
    """
    all_files = []    # Will contain found files

    # Walk top-down through directory tree
    for root, _dirs, files in os.walk(location, topdown=True):
        for name in files:
            # Compare on a lower-cased copy only; the original spelling is
            # what gets returned, so the path still points at the real file.
            lowered = name.lower()
            if lowered.endswith('.docx') and not lowered.endswith('ss.docx'):
                all_files.append(os.path.join(root, name))

    return all_files

Code–Option 2 using Regular Expression

def find_docx(location):
    """Return the paths of all wanted .docx files below *location*.

    Regular-expression variant. A file is wanted when its name ends in
    '.docx' and does not end in '.pdf' or 'ss.docx'; all tests ignore case.

    Args:
        location: Root directory to search.

    Returns:
        A list of paths (walk root joined with the file name), in the
        order os.walk reports them.
    """
    import re  # imported here so the function works without a module-level import

    # Dots are escaped so they match a literal '.', and \Z anchors at the
    # very end of the name (unlike '$', it does not accept a trailing newline).
    desired = re.compile(r"\.docx\Z", re.IGNORECASE)                 # .docx suffix
    undesired = re.compile(r"(?:\.pdf|ss\.docx)\Z", re.IGNORECASE)   # pdf and ss.docx suffix
    all_files = []    # Will contain found files

    # Walk top-down through directory tree
    for root, _dirs, files in os.walk(location, topdown=True):
        for name in files:
            # search() is used because the patterns only describe the suffix
            if desired.search(name) and not undesired.search(name):
                all_files.append(os.path.join(root, name))

    return all_files

Usage

Either of the above find_docx functions can be used as follows:

# Root directory to scan; the raw string keeps the Windows backslashes literal.
location = r'C:\Data_analysis\N_TRACKING'
all_files = find_docx(location)

Leave a Reply

Your email address will not be published. Required fields are marked *