How to parse JS code into one-operation-per-line with fewest temp variables?

Say I have this code:

// Walk GF(2^8)
var x = 0;
var xi = 0;
for (var i = 0; i < 256; i++) {
    // Compute sbox
    var sx = xi ^ (xi << 1) ^ (xi << 2) ^ (xi << 3) ^ (xi << 4);
    sx = (sx >>> 8) ^ (sx & 0xff) ^ 0x63;
    SBOX[x] = sx;
    INV_SBOX[sx] = x;

    // Compute multiplication
    var x2 = d[x];
    var x4 = d[x2];
    var x8 = d[x4];

    // Compute sub bytes, mix columns tables
    var t = (d[sx] * 0x101) ^ (sx * 0x1010100);
    SUB_MIX_0[x] = (t << 24) | (t >>> 8);
    SUB_MIX_1[x] = (t << 16) | (t >>> 16);
    SUB_MIX_2[x] = (t << 8)  | (t >>> 24);
    SUB_MIX_3[x] = t;

    // Compute inv sub bytes, inv mix columns tables
    var t = (x8 * 0x1010101) ^ (x4 * 0x10001) ^ (x2 * 0x101) ^ (x * 0x1010100);
    INV_SUB_MIX_0[sx] = (t << 24) | (t >>> 8);
    INV_SUB_MIX_1[sx] = (t << 16) | (t >>> 16);
    INV_SUB_MIX_2[sx] = (t << 8)  | (t >>> 24);
    INV_SUB_MIX_3[sx] = t;

    // Compute next counter
    if (!x) {
        x = xi = 1;
    } else {
        x = x2 ^ d[d[d[x8 ^ x2]]];
        xi ^= d[d[xi]];
    }
}

I am using meriyah to parse the AST. But just generally, since this is rather involved, how do you go about figuring out how many temp variables you need if you were to put every operation on one line?

So the first line of the loop:

var sx = xi ^ (xi << 1) ^ (xi << 2) ^ (xi << 3) ^ (xi << 4);

It would be made like:

var foo1 = xi << 1
var foo2 = xi << 2
var foo3 = xi << 3
var foo4 = xi << 4
var sx = xi ^ (foo1) ^ (foo2) ^ (foo3) ^ (foo4);

To become something like:

var foo1 = xi << 1
var foo2 = xi << 2
var foo3 = xi << 3
var foo4 = xi << 4
var foo6 = (foo3) ^ (foo4)
var foo7 = (foo2) ^ foo6
var foo8 = (foo1) ^ foo7
var sx = xi ^ foo8;

But better would be:

var foo1 = xi << 1
var foo2 = xi << 2
var foo3 = xi << 3
var foo4 = xi << 4
foo4 = (foo3) ^ (foo4)
foo4 = (foo2) ^ foo4
foo4 = (foo1) ^ foo4
var sx = xi ^ foo4;

So we minimize how many variables we create. How do you do this or even think about it programmatically?

AST for the line of code I tried is here:

{
  "type": "Program",
  "sourceType": "script",
  "body": [
    {
      "type": "VariableDeclaration",
      "kind": "var",
      "declarations": [
        {
          "type": "VariableDeclarator",
          "id": {
            "type": "Identifier",
            "name": "sx"
          },
          "init": {
            "type": "BinaryExpression",
            "left": {
              "type": "BinaryExpression",
              "left": {
                "type": "BinaryExpression",
                "left": {
                  "type": "BinaryExpression",
                  "left": {
                    "type": "Identifier",
                    "name": "xi"
                  },
                  "right": {
                    "type": "BinaryExpression",
                    "left": {
                      "type": "Identifier",
                      "name": "xi"
                    },
                    "right": {
                      "type": "Literal",
                      "value": 1
                    },
                    "operator": "<<"
                  },
                  "operator": "^"
                },
                "right": {
                  "type": "BinaryExpression",
                  "left": {
                    "type": "Identifier",
                    "name": "xi"
                  },
                  "right": {
                    "type": "Literal",
                    "value": 2
                  },
                  "operator": "<<"
                },
                "operator": "^"
              },
              "right": {
                "type": "BinaryExpression",
                "left": {
                  "type": "Identifier",
                  "name": "xi"
                },
                "right": {
                  "type": "Literal",
                  "value": 3
                },
                "operator": "<<"
              },
              "operator": "^"
            },
            "right": {
              "type": "BinaryExpression",
              "left": {
                "type": "Identifier",
                "name": "xi"
              },
              "right": {
                "type": "Literal",
                "value": 4
              },
              "operator": "<<"
            },
            "operator": "^"
          }
        }
      ]
    }
  ]
}

How would you handle this specific case in a generic way, so it could be generalized and used elsewhere (with a bit more work)?

Answer

First of all, you don’t need so many variables for var sx = xi ^ (xi << 1) ^ (xi << 2) ^ (xi << 3) ^ (xi << 4);

var tmp1 = (xi << 4);

var tmp2 = (xi << 3);
tmp2 = tmp2 ^ tmp1;

tmp1 = (xi << 2);
tmp1 = tmp1 ^ tmp2;

tmp2 = (xi << 1);
tmp2 = tmp2 ^ tmp1;

tmp2 = xi ^ tmp2;

If you count the “result” variable sx as a non-temporary one (since it has a declaration in code), then we are down to one declared and one temporary variable.

In any case, the AST for the “long expression” looks like this (at expression level, we don’t care to split xi << 4 into a node, since it has no sub-expressions):

                ┌────────── ^ ───────┐
            ┌───  ^ ──────────┐     xi << 4
      ┌───  ^ ─────────┐     xi << 3
┌───  ^ ────┐         xi << 2
xi         xi << 1

To traverse the tree (tl;dr Post-order tree traversal):

  1. We go to the deepest node (op: ^, left: xi, right: xi << 1) and store the expression into a temp variable. Technically, you could split this into two steps and handle the right expression first.

    var tmp1 = xi << 1

    No more sub-expressions in this node, so we can go ahead and “resolve” the level while reusing the same temp variable.

    tmp1 = xi ^ tmp1

  2. Go to the parent node. This level looks like op: ^, left: tmp1, right: xi << 2. Do the same thing, but we’ll have to create a new variable.

    tmp2 = xi << 2

    No more expressions, so:

    tmp1 = tmp1 ^ tmp2

    Observation: We can now discard tmp2, since we only need one variable to return the result.

  3. Parent node again: op: ^, left: tmp1, right: xi << 3

    tmp2 = xi << 3

    tmp1 = tmp1 ^ tmp2

    Observation: We have technically re-used the tmp2 from the child node.

  4. Same thing again: op: ^, left: tmp1, right: xi << 4

    tmp2 = xi << 4

    tmp1 = tmp1 ^ tmp2

  5. Done, the result is in tmp

This is the general logic. Some points that you have to keep in mind:

  1. You could have more than one sub-expression in one level. For example, if the deepest level was left: xi + 1, right: xi << 1. You would need extra temporary variables, then.

  2. You could have more than two children. E.g. a > 0 ? a + 5 : a - 5; or a function call f(a + 1, a + 2, a + 3). The conditional expression can technically only go one way, so one of the variables won’t be used, but I would give call expressions a thought. Combined with point 1, this can influence how many temporary variables you need for a node.

  3. The above two points can be combined to reuse variables. If you had to use 2 temps at the lowest level (left: xi + 1, right: xi << 1), then you can re-use the second temp in the parent nodes. You can keep a counter or a list of declared variables.

  4. Beware commas.

UPDATE

Here’s a sample that works for your expression. To support all JS expressions would of course be much more difficult: await expressions, function expressions, spread, etc.

const meriyah = require('meriyah');
const src = `var res = (!xi) ^ (xi << 1) ^ (xi << 2) ^ (xi << 3);`
const ast = meriyah.parse(src);
const longExpression = ast.body[0].declarations[0].init; // (!xi) ^ (xi << 1) ^ (xi << 2) ^ (xi << 3)

const state = {
    ops: [],
    nextTemp: 0,
    maxTempCount: 0,
};

function isSimple(node) {
    return node.type === 'Identifier' || node.type === 'Literal';
}

/**
 * @param {meriyah.ESTree.Expression} node
 */
function getChildren(node) {
    switch (node.type) {
        case 'BinaryExpression':
        case 'LogicalExpression':
            return [node.left, node.right];
        case 'CallExpression':
            return node.arguments;
        case 'UnaryExpression':
        case 'UpdateExpression':
            return [node.argument];
        default:
            throw new Error('Not supported.')
    }
}

/**
 * @param {meriyah.ESTree.Expression} node
 * @return string representation
 */
function simplify(node) {
    if (isSimple(node)) {
        // if it's a literal or identifier,
        // we don't need to do anything at all.
        // just return the string representation to print the ops
        return node.type === 'Literal'
                ? node.value
                : node.name;
    }

    // store current temp.
    // this will be reused so that we can "discard" any temps we create in this node
    const originalTemp = state.nextTemp;

    // simplify each of the children first.
    // for each child we'll need one temp var
    const children = getChildren(node);
    const simplified = children.map(node => {
        const stringRepr = simplify(node);
        state.nextTemp++;
        return stringRepr;
    });

    // update maximum temp variable count;
    state.maxTempCount = Math.max(state.nextTemp - 1, state.maxTempCount);
    // roll back the nextTemp;
    // the parent node will do nextTemp++ as needed for any further ops
    state.nextTemp = originalTemp;

    // and finally, do the operation for this node.
    // at this point, we reuse the original temp
    // e.g. tmp0 = op tpm0 tmp1 tmp2
    state.ops.push({op: node.operator || node.callee.name, args: simplified, temp: originalTemp});

    // the string representation for parent node operation
    return 'tmp' + state.nextTemp;
}

simplify(longExpression);

console.log('Vars needed: ', state.maxTempCount);
console.log(state.ops.map(({op, args, temp}) =>
    `tmp${temp} = ${op} ${args.join(' ')}`));

PS Feedback welcome, wizards of stack overflow. I’m sure all of this is well-trodden ground.