How do I get pandas str.contains() to correctly select the rows with ‘Virginia’ and ‘West Virginia’?

I am trying to parse a csv that contains a state column. I want to make a csv for each individual state from the one aggregated csv. the code makes a dataframe for ‘Virginia’ and ‘West Virginia’ but the problem is that the ‘Virginia’ df also includes with it all the ‘West Virginia’ rows. Any ideas on how to fix this? I was able to solve the same issue with ‘Arkansas’ and ‘Kansas’ by setting regex=False.

df = pd.read_csv(io.StringIO(stat.decode('utf-8')))

states = parse(df, 'state')

write_states(df, states)

def parse(df, suffix):
    df = df.sort_values(by=[suffix])
    df = df[suffix]
    df = df.drop_duplicates()
    df = [df for df in df]
    return df

def write_states(df, states):
    mk_dir('states')
    print(f"writing to '{os.path.join(os.getcwd(), 'states')}'")
    d = df
    s = tqdm(states, ncols=103, leave=False, ascii=' #')
    for state in s:
        s.set_description(state)
        df = d[d['state'].str.contains(state, regex=False)]
        dates = np.array(df['date'], dtype='datetime64')
        states = np.array(df['state'])
        total_cases = np.array(df['cases'], dtype='int64')
        total_deaths = np.array(df['deaths'], dtype='int64')
        new_cases = get_diff(total_cases)
        new_deaths = get_diff(total_deaths)
        df = pd.DataFrame({'date': dates, 'state': states, 'total cases': total_cases, 
            'total deaths': total_deaths, 'new cases': new_cases, 'new deaths': new_deaths})
        df.to_csv(f"states/{state}.csv", index=False)

Answer

What about adding ^ and $ to the regex? This should handle those ambiguities like West/Virginia, Ar/kansas, etc.

df = d[d['state'].str.contains(f'^{state}$', case=False)]

Leave a Reply

Your email address will not be published. Required fields are marked *