Convert nested JSON to CSV or table

I know this question has been asked many times but none of the answers satisfy my requirement. I want to dynamically convert any nested JSON to a CSV file or Dataframe. Some sample examples are:

input : {"menu": {
    "header": "SVG Viewer",
    "items": [
        {"id": "Open"},
        {"id": "OpenNew", "label": "Open New"},
        null,
        {"id": "ZoomIn", "label": "Zoom In"},
        {"id": "ZoomOut", "label": "Zoom Out"},
        {"id": "OriginalView", "label": "Original View"},
        null,
        {"id": "Quality"},
        {"id": "Pause"},
        {"id": "Mute"},
        null,
        {"id": "Find", "label": "Find..."},
        {"id": "FindAgain", "label": "Find Again"},
        {"id": "Copy"},
        {"id": "CopyAgain", "label": "Copy Again"},
        {"id": "CopySVG", "label": "Copy SVG"},
        {"id": "ViewSVG", "label": "View SVG"},
        {"id": "ViewSource", "label": "View Source"},
        {"id": "SaveAs", "label": "Save As"},
        null,
        {"id": "Help"},
        {"id": "About", "label": "About Adobe CVG Viewer..."}
    ]
}}

Output: enter image description here

input 2 : {"menu": {
  "id": "file",
  "value": "File",
  "popup": {
    "menuitem": [
      {"value": "New", "onclick": "CreateNewDoc()"},
      {"value": "Open", "onclick": "OpenDoc()"},
      {"value": "Close", "onclick": "CloseDoc()"}
    ]
  }
}}

Output 2: enter image description here

So far I have tried below code, which works fine but it explodes the list type data into columns, but I want it be exploded in rows.

from pandas.io.json import json_normalize
import pandas as pd


def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            
            for a in x:
                
                flatten(x[a], name + a + '.')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '.')
                i += 1
        else:
            
            out[str(name[:-1])] = str(x)

    flatten(y)
    return out
  
def start_explode(data):
    
  if type(data) is dict: 
    df = pd.DataFrame([flatten_json(data)])
  else:
    df = pd.DataFrame([flatten_json(x) for x in data])
  
  df = df.astype(str)
  return df

complex_json = {"menu": {
  "id": "file",
  "value": "File",
  "popup": {
    "menuitem": [
      {"value": "New", "onclick": "CreateNewDoc()"},
      {"value": "Open", "onclick": "OpenDoc()"},
      {"value": "Close", "onclick": "CloseDoc()"}
    ]
  }
}}
df = start_explode(complex_json['menu'])
display(df)

It gives output like below for one the above inputs:

enter image description here

Answer

  • standard techniques for dealing with nested json
    1. json_normalize()
    2. explode()
    3. apply(pd.Series)
  • finally some cleanup, drop unwanted rows and replace nan with empty string
import json
js = """{"menu": {
    "header": "SVG Viewer",
    "items": [
        {"id": "Open"},
        {"id": "OpenNew", "label": "Open New"},
        null,
        {"id": "ZoomIn", "label": "Zoom In"},
        {"id": "ZoomOut", "label": "Zoom Out"},
        {"id": "OriginalView", "label": "Original View"},
        null,
        {"id": "Quality"},
        {"id": "Pause"},
        {"id": "Mute"},
        null,
        {"id": "Find", "label": "Find..."},
        {"id": "FindAgain", "label": "Find Again"},
        {"id": "Copy"},
        {"id": "CopyAgain", "label": "Copy Again"},
        {"id": "CopySVG", "label": "Copy SVG"},
        {"id": "ViewSVG", "label": "View SVG"},
        {"id": "ViewSource", "label": "View Source"},
        {"id": "SaveAs", "label": "Save As"},
        null,
        {"id": "Help"},
        {"id": "About", "label": "About Adobe CVG Viewer..."}
    ]
}}"""

df = pd.json_normalize(json.loads(js)).explode("menu.items").reset_index(drop=True)
df.drop(columns=["menu.items"]).join(df["menu.items"].apply(pd.Series)).dropna(subset=["id"]).fillna("")

menu.header id label
0 SVG Viewer Open
1 SVG Viewer OpenNew Open New
3 SVG Viewer ZoomIn Zoom In
4 SVG Viewer ZoomOut Zoom Out
5 SVG Viewer OriginalView Original View
7 SVG Viewer Quality
8 SVG Viewer Pause
9 SVG Viewer Mute
11 SVG Viewer Find Find…
12 SVG Viewer FindAgain Find Again
13 SVG Viewer Copy
14 SVG Viewer CopyAgain Copy Again
15 SVG Viewer CopySVG Copy SVG
16 SVG Viewer ViewSVG View SVG
17 SVG Viewer ViewSource View Source
18 SVG Viewer SaveAs Save As
20 SVG Viewer Help
21 SVG Viewer About About Adobe CVG Viewer…

utility function

  • if you don’t want to name columns, but take first list column
  • identify first column that contains lists
  • explode() and apply(pd.Series) to that column
  • provided option to expand all lists
def normalize(js, expand_all=False):
    df = pd.json_normalize(json.loads(js) if type(js)==str else js)
    # get first column that contains lists
    col = df.applymap(type).astype(str).eq("<class 'list'>").all().idxmax()
    # explode list and expand embedded dictionaries
    df = df.explode(col).reset_index(drop=True)
    df = df.drop(columns=[col]).join(df[col].apply(pd.Series), rsuffix=f".{col}")
    # any lists left?
    if expand_all and df.applymap(type).astype(str).eq("<class 'list'>").any(axis=1).all():
        df = normalize(df.to_dict("records"))
    return df

js = """{ "id": "0001", "type": "donut", "name": "Cake", "ppu": 0.55, "batters": { "batter": [ { "id": "1001", "type": "Regular" }, { "id": "1002", "type": "Chocolate" }, { "id": "1003", "type": "Blueberry" }, { "id": "1004", "type": "Devil's Food" } ] }, "topping": [ { "id": "5001", "type": "None" }, { "id": "5002", "type": "Glazed" }, { "id": "5005", "type": "Sugar" } ] }"""

normalize(js, expand_all=True)

id type name ppu id.topping type.topping id.batters.batter type.batters.batter
0 0001 donut Cake 0.55 5001 None 1001 Regular
1 0001 donut Cake 0.55 5001 None 1002 Chocolate
2 0001 donut Cake 0.55 5001 None 1003 Blueberry
3 0001 donut Cake 0.55 5001 None 1004 Devil’s Food
4 0001 donut Cake 0.55 5002 Glazed 1001 Regular
5 0001 donut Cake 0.55 5002 Glazed 1002 Chocolate
6 0001 donut Cake 0.55 5002 Glazed 1003 Blueberry
7 0001 donut Cake 0.55 5002 Glazed 1004 Devil’s Food
8 0001 donut Cake 0.55 5005 Sugar 1001 Regular
9 0001 donut Cake 0.55 5005 Sugar 1002 Chocolate
10 0001 donut Cake 0.55 5005 Sugar 1003 Blueberry
11 0001 donut Cake 0.55 5005 Sugar 1004 Devil’s Food

consider each list independent

def n2(js):
    df = pd.json_normalize(json.loads(js))
    # columns that contain lists
    cols = [i for i, c in df.applymap(type).astype(str).eq("<class 'list'>").all().iteritems() if c]
    # use list from first row
    return pd.concat(
        [df.drop(columns=cols)]
        + [pd.json_normalize(df.loc[0, c]).pipe(lambda d: d.rename(columns={c2: f"{c}.{c2}" for c2 in d.columns}))
            for c in cols],
        axis=1,
    ).fillna("")