Python/pandas: finding missing items among similarly grouped data (nested iteration is not efficient)

I will start by saying I am VERY new to Python/pandas. I have a DataFrame with about 1.5 million rows of data and growing; below is an abstraction of the data (the first table is the output I want, the second is a sample of the input). I am trying to find hosts in the same release and group where a host is missing a path that the other hosts have in common. My approach was to iterate through the data, but it is not very efficient. I would appreciate any feedback on a different approach, or on improving the performance of my current approach. Thank you.


release group host missing_path ReferenceHosts
A one abc c:onethree def:ghi


release group host path
A one abc c:onetwo
A one def c:onetwo
A one def c:onethree
A one ghi c:onetwo
A one ghi c:onethree
A two… lots of groups
B… lots of releases

    # Driver steps: collect the distinct releases and groups, map each group
    # to its hosts, then run the missing-file audit.

    # Distinct, non-null values of the 'Release' column as a plain Python list.
    list_releases = df['Release'].dropna().unique().tolist()

    # Distinct, non-null values of the 'Group' column as a plain Python list.
    list_groups = df['Group'].dropna().unique().tolist()

    # Build a dictionary of the form {group: [hosts]}.
    # NOTE(review): hosts_by_group is not shown here; the shape above is taken
    # from the author's own comment -- confirm against its definition.
    lists_hosts = hosts_by_group(list_groups, df)

    # Run the audit over every release; find_missing_files returns a
    # DataFrame built from the module-level `overview` dict.
    audit_missing = find_missing_files(list_releases, lists_hosts, df)

# Column-oriented accumulator for the audit report: every output column starts
# as its own empty list. find_missing_files turns this dict into a DataFrame.
overview = {
    column: []
    for column in (
        "Release",
        "Group",
        "SubjectHost",
        "FileMissing",
        "ReferenceHosts",
        "Ref1Domain",
        "Ref2Domain",
        "SubjDomain",
        "Extension",
    )
}

def generate_overview(grp, hst, ref, hi, ref1_domain, ref2_domain, subj_domain, df,release,hosts, checking_host, idx2):
    """Find paths shared by two reference hosts that the subject host lacks.

    The reference hosts are ``hosts[idx2]`` and ``hosts[hi]``. Their rows are
    restricted to ``release`` and inner-joined on ``Path``; a joined path that
    does not also appear in ``checking_host`` counts as missing.

    NOTE(review): as written, the function only looks up the extension of
    each missing path. Nothing is stored or returned, and ``grp``, ``hst``,
    ``ref`` and the three ``*_domain`` arguments are never read. Presumably
    the code that records the finding was omitted -- confirm.

    Raises:
        ValueError: from ``Series.item()`` when a missing path matches more
            than one joined row.
    """
    release_rows = df[df.Release == release]
    first_ref = release_rows[release_rows.Hostname == hosts[idx2]]
    second_ref = release_rows[release_rows.Hostname == hosts[hi]]

    # Paths both reference hosts have; overlapping columns get _x/_y suffixes.
    shared_by_refs = first_ref.merge(second_ref, how="inner", on=["Path"]).dropna()
    # The subset of those paths that the subject host has as well.
    present_on_subject = checking_host.merge(shared_by_refs, how="inner", on=["Path"]).dropna()

    absent = ~shared_by_refs["Path"].isin(present_on_subject["Path"])
    files_not_found = shared_by_refs[absent].dropna()
    if files_not_found.empty:
        return

    for missing_path in files_not_found["Path"].tolist():
        same_path = files_not_found["Path"] == missing_path
        # Extension as recorded on the first reference host's row.
        ext = files_not_found.loc[same_path, "Extension_x"].item()

 def missing_file_process(hosts,df, group, release):
        """Check every host in ``hosts`` against each adjacent reference pair.

        For each subject host, the reference pairs are
        ``(hosts[i], hosts[i + 1])`` for every index ``i``, with the last host
        wrapping around to pair with ``hosts[0]``. Each combination is passed
        to ``generate_overview``.

        Args:
            hosts: host names belonging to ``group``.
            df: frame with at least ``Hostname`` and ``Domain`` columns.
            group: group label, forwarded unchanged.
            release: release label, forwarded unchanged.

        Raises:
            IndexError: if a host in ``hosts`` has no rows in ``df``, so no
                domain can be resolved for it.
        """
        num_hosts = len(hosts)

        # A host's domain does not depend on the loop position, so resolve it
        # once per host instead of rescanning the frame for every pair.
        domain_of = {
            name: (df[(df.Hostname == name)].Domain.unique())[0]
            for name in hosts
        }

        for host in hosts:
            # NOTE(review): the subject rows are not restricted to `release`,
            # while generate_overview restricts the reference rows to it --
            # confirm this is intended.
            checking_host = df[(df.Hostname == host)]
            subj_domain = domain_of[host]

            for idx2, first_ref in enumerate(hosts):
                # The next host in the list; the last one wraps to index 0.
                hosts_index = (idx2 + 1) % num_hosts
                second_ref = hosts[hosts_index]
                ref = first_ref + ":" + second_ref
                generate_overview(
                    group, host, ref, hosts_index,
                    domain_of[first_ref], domain_of[second_ref], subj_domain,
                    df, release, hosts, checking_host, idx2,
                )

 def find_missing_files(releases, lists_hosts, df):
        """Run the missing-file check for every release and eligible group.

        ``lists_hosts`` maps each group to its hosts. Groups with two or fewer
        hosts are skipped. Returns a DataFrame built from the module-level
        ``overview`` dict.
        """
        for release in releases:
            for group, hosts in lists_hosts.items():
                if len(hosts) <= 2:
                    continue
                missing_file_process(hosts, df, group, release)

        return pd.DataFrame(data=overview)


Try this:

# Set of every distinct path seen anywhere within each (release, group)
tmp1 = (
    df.groupby(['release', 'group'])['path']
    .apply(set)
    .rename('path1')
    .reset_index()
)

# Set of distinct paths each individual host has within its (release, group)
tmp2 = (
    df.groupby(['release', 'group', 'host'])['path']
    .apply(set)
    .rename('path2')
    .reset_index()
)

# One row per host, paired with its group's full path set; the set difference
# gives the paths that host lacks
tmp3 = tmp1.merge(tmp2, how='right', on=['release', 'group'])
tmp3['missing'] = tmp3['path1'] - tmp3['path2']

# Keep only hosts whose set of missing paths is non-empty
result = tmp3[tmp3['missing'].map(bool)]


release group                      path1 host        path2        missing
      A   one {c:onethree, c:onetwo}  abc {c:onetwo} {c:onethree}

You can rearrange the columns to taste 😀