Common Sub-Expressions In Large Sheets

Typical Sheet

The following sheet, while grossly oversimplified, seems to be a very typical sheet for the insurance models used by insurance companies (a relative of mine is a mathematician at one of them). The typical things are the large table and its inhomogeneity (in practice, the tables are much larger than the one here). This is not a simple repeating section with a perfectly homogeneous structure.

	A	B	C	D	E	F	G	H	I
1	Inputs
2	Insured amount	$15,000.00 (InsuredAmount)
3	Age	21 (Age)
4
5	Outputs
6	Cost per year	480 =INDEX(G12:G48,B9) (InsuranceCostPerYear)
7
8	Helpers
9	Matching row	15 =MATCH(true,C12:C48,0.0)
10
11	Lookup Table	AgeUpTo	Match	AgeUpTo (Copy)	InsuredAmountUpTo	Percentage	InsuranceCostPerYear
12		20	false =AND(Age<=D12,InsuredAmount<=E12)	20 =B12	$5,000.00	3%	$150.00 =E12*F12
13			false =AND(Age<=D13,InsuredAmount<=E13)	20 =B12	$6,000.00	3%	$180.00 =E13*F13
14			false =AND(Age<=D14,InsuredAmount<=E14)	20 =B12	$8,000.00	3%	$240.00 =E14*F14
15			false =AND(Age<=D15,InsuredAmount<=E15)	20 =B12	$10,000.00	4%	$400.00 =E15*F15	Here we switch the percentage.
16			false =AND(Age<=D16,InsuredAmount<=E16)	20 =B12	$13,000.00	4%	$520.00 =E16*F16
17			false =AND(Age<=D17,InsuredAmount<=E17)	20 =B12	$16,000.00	4%	$640.00 =E17*F17
18			false =AND(Age<=D18,InsuredAmount<=E18)	20 =B12	$20,000.00	5%	$1,000.00 =E18*F18	Again.
19			false =AND(Age<=D19,InsuredAmount<=E19)	20 =B12	$25,000.00	5%	$1,250.00 =E19*F19
20			false =AND(Age<=D20,InsuredAmount<=E20)	20 =B12	$30,000.00	5%	$1,500.00 =E20*F20
21		22	false =AND(Age<=D21,InsuredAmount<=E21)	22 =B21	$5,000.00	3%	$150.00 =E21*F21	The next age group is similar, but with different percentages.
22			false =AND(Age<=D22,InsuredAmount<=E22)	22 =B21	$6,000.00	3%	$180.00 =E22*F22
23			false =AND(Age<=D23,InsuredAmount<=E23)	22 =B21	$8,000.00	3%	$240.00 =E23*F23
24			false =AND(Age<=D24,InsuredAmount<=E24)	22 =B21	$10,000.00	3%	$300.00 =E24*F24
25			false =AND(Age<=D25,InsuredAmount<=E25)	22 =B21	$13,000.00	3%	$390.00 =E25*F25
26			true =AND(Age<=D26,InsuredAmount<=E26)	22 =B21	$16,000.00	3%	$480.00 =E26*F26
27			true =AND(Age<=D27,InsuredAmount<=E27)	22 =B21	$20,000.00	4%	$800.00 =E27*F27
28			true =AND(Age<=D28,InsuredAmount<=E28)	22 =B21	$25,000.00	4%	$1,000.00 =E28*F28
29			true =AND(Age<=D29,InsuredAmount<=E29)	22 =B21	$30,000.00	4%	$1,200.00 =E29*F29
30		25	false =AND(Age<=D30,InsuredAmount<=E30)	25 =B30	$5,000.00	3%	$150.00 =E30*F30	Similar again, different percentages again.
31			false =AND(Age<=D31,InsuredAmount<=E31)	25 =B30	$6,000.00	3%	$180.00 =E31*F31
32			false =AND(Age<=D32,InsuredAmount<=E32)	25 =B30	$8,000.00	3%	$240.00 =E32*F32
33			false =AND(Age<=D33,InsuredAmount<=E33)	25 =B30	$10,000.00	3%	$300.00 =E33*F33
34			false =AND(Age<=D34,InsuredAmount<=E34)	25 =B30	$13,000.00	3%	$390.00 =E34*F34
35			true =AND(Age<=D35,InsuredAmount<=E35)	25 =B30	$16,000.00	3%	$480.00 =E35*F35
36			true =AND(Age<=D36,InsuredAmount<=E36)	25 =B30	$20,000.00	3%	$600.00 =E36*F36
37			true =AND(Age<=D37,InsuredAmount<=E37)	25 =B30	$25,000.00	3%	$750.00 =E37*F37
38			true =AND(Age<=D38,InsuredAmount<=E38)	25 =B30	$30,000.00	3%	$900.00 =E38*F38
39		60	true =AND(Age<=D39,InsuredAmount<=E39)	60 =B39	$30,000.00	3.1600% =2.0%+ABS(Age-50.0)/25.0*1.0%	$474.00 =InsuredAmount*F39	Complete switch of structure here.
40		200	false =AND(Age<=D40,InsuredAmount<=E40)	200 =B40	$5,000.00	3%	$450.00 =InsuredAmount*F40	Percentages again, but no fixed amounts.
41			false =AND(Age<=D41,InsuredAmount<=E41)	200 =B40	$6,000.00	3%	$450.00 =InsuredAmount*F41
42			false =AND(Age<=D42,InsuredAmount<=E42)	200 =B40	$8,000.00	3%	$450.00 =InsuredAmount*F42
43			false =AND(Age<=D43,InsuredAmount<=E43)	200 =B40	$10,000.00	4%	$600.00 =InsuredAmount*F43
44			false =AND(Age<=D44,InsuredAmount<=E44)	200 =B40	$13,000.00	4%	$600.00 =InsuredAmount*F44
45			true =AND(Age<=D45,InsuredAmount<=E45)	200 =B40	$16,000.00	4%	$600.00 =InsuredAmount*F45
46			true =AND(Age<=D46,InsuredAmount<=E46)	200 =B40	$20,000.00	5%	$750.00 =InsuredAmount*F46
47			true =AND(Age<=D47,InsuredAmount<=E47)	200 =B40	$25,000.00	5%	$750.00 =InsuredAmount*F47
48			true =AND(Age<=D48,InsuredAmount<=E48)	200 =B40	$30,000.00	5%	$750.00 =InsuredAmount*F48

Currently, SEJ compiles this to something like this:

public static final class OutputsCurrent implements Outputs
{
  private static final double[] COST_TABLE = { 150, 180, 240 /* ... all of 37 elts */};
  private final Inputs i;

  public OutputsCurrent( Inputs _inputs )
  {
    this.i = _inputs;
  }

  private double getAge()
  {
    return this.i.getAge();
  }

  private double getInsuredAmount()
  {
    return this.i.getInsuredAmount();
  }

  public double getInsuranceCostPerYear()
  {
    int r;
    switch (r = ((int) getMatchingRow()) - 1) {

      case 27:
        return getInsuredAmount() * (0.02 + Math.abs( getAge() - 50 ) / 25 * 0.01);

      case 28:
        return getInsuredAmount() * 0.03;
      case 29:
        return getInsuredAmount() * 0.03;
      case 30:
        return getInsuredAmount() * 0.03;
      case 31:
        return getInsuredAmount() * 0.04;
      case 32:
        return getInsuredAmount() * 0.04;
      case 33:
        return getInsuredAmount() * 0.04;

        /* ... for all rows of this form */

      default:
        if (r >= 0 && r < COST_TABLE.length) {
          return COST_TABLE[ r ];
        }
        return 0;
    }
  }

  private final double getMatchingRow()
  {
    int r = 1;

    // This is what really happens:
    if (((getAge() > 20.0 || getInsuredAmount() > 5000.0) ? 1.0 : 0.0) != 0.0) {
      r++;

      // This is what could be made to happen with a bit of peephole optimization:
      if (getAge() > 20.0 || getInsuredAmount() > 6000.0) {
        r++;

        if (getAge() > 20.0 || getInsuredAmount() > 8000.0) {
          r++;

          // ... for all of 37 cases */

        }
        else {
          r = 0;
        }
      }
    }

    return r;
  }

}

There are two stupid things here (apart from the possible peephole optimization noted in the code):

The method getMatchingRow() is a nightmare of redundancy and code bloat.
So is the repetition of return getInsuredAmount() * x in getInsuranceCostPerYear().

Now, code bloat now only means more code, it also means

less efficient use of high-speed CPU caches,
more JIT overhead, and
no clearly identifiable, small, and often-used hotspots in the code for the JITer to optimize heavily.

So the question is, how do we get more efficient code?

Homogenized Sheet

Now, this table could be made homogeneous, of course. It involves a formula type column in the table, and then another INDEX to select the proper computation variant to use (only the start of the sheet is shown):

	A	B	C	D	E	F	G	H	I	J	K
2	Insured amount	$15,000.00 (InsuredAmount)
3	Age	62 (Age)
4
5	Outputs
6	Cost per year	600 =VLOOKUP(true,C9:K45,9.0,false) (InsuranceCostPerYear)
7
8	Lookup Table	AgeUpTo	Match	AgeUpTo (Copy)	InsuredAmountUpTo	Percentage	Style	Style 1	Style 2	Style 3	Insured amount
9		20	false =AND(Age<=D9,InsuredAmount<=E9)	20 =B9	$5,000.00	3%	1	$150.00 =E9*F9	$372.00 =InsuredAmount(2.0%+ABS(Age-50.0)/25.01.0%)	$450.00 =InsuredAmount*F9	$150.00 =INDEX(H9:J9,G9)
10			false =AND(Age<=D10,InsuredAmount<=E10)	20 =B9	$6,000.00	3%	1	$180.00 =E10*F10	$372.00 =InsuredAmount(2.0%+ABS(Age-50.0)/25.01.0%)	$450.00 =InsuredAmount*F10	$180.00 =INDEX(JD10:JF10,G10)
11			false =AND(Age<=D11,InsuredAmount<=E11)	20 =B9	$8,000.00	3%	1	$240.00 =E11*F11	$372.00 =InsuredAmount(2.0%+ABS(Age-50.0)/25.01.0%)	$450.00 =InsuredAmount*F11	$240.00 =INDEX(JD11:JF11,G11)

Now we could simply declare a non-dynamic section over the table (a feature SEJ would have to learn), which would make VLOOKUP() iterate over a statically initialized section (from the constants found in the range) and return the final result for the first match.

This would perform well even if the static section were a sliding cursor across static double[] arrays (for very large tables). Only if we need more than one return value from the table would we run into trouble: the lookup methods only ever return a single value. So we would have to re-iterate the table for every needed value.

If the static section were held in an array of objects (rather than a sliding cursor), we could use an initial MATCH and then multiple calls to INDEX to work around this with decent performance (at the cost of slightly higher memory usage).

Normalized Sheet

The approach taken above is like relational normalization, really. Done more fully, we get:

	A	B	C	D	E	F	G
1	Inputs
2	Insured amount	$15,000.00 (InsuredAmount)
3	Age	62 (Age)
4
5	Outputs
6	Cost per year	600 =INDEX(B11:B13,B10) (InsuranceCostPerYear)
7
8	Helpers
9	Matching row	34 =MATCH(true,C16:C52,0.0)
10	Computation style	3 =INDEX(G16:G52,B9)
11	Style 1	$640.00 =INDEX(E16:E52,B9)*INDEX(F16:F52,B9)	Amount rounded up; percentage from table
12	Style 2	$372.00 =InsuredAmount(2.0%+ABS(Age-50.0)/25.01.0%)	Exact amount; percentage computed from age
13	Style 3	$600.00 =InsuredAmount*INDEX(F16:F52,B9)	Exact amount; percentage from table
14
15	Lookup Table	AgeUpTo	Match	AgeUpTo (Copy)	InsuredAmountUpTo	Percentage	Style
16		20	false =AND(Age<=D16,InsuredAmount<=E16)	20 =B16	$5,000.00	3%	1
17			false =AND(Age<=D17,InsuredAmount<=E17)	20 =B16	$6,000.00	3%	1
18			false =AND(Age<=D18,InsuredAmount<=E18)	20 =B16	$8,000.00	3%	1

If the static section is held in an array of objects (rather than a sliding cursor), we would get decent performance from this. The sliding cursor would have to re-iterate for every INDEX() we have here.

Typical Sheet Again

All this is not how typical Excel users think, however. They will much sooner come up with the first version of the sheet than the others.

So the question is, can we compile the first version to efficient code? Maybe by looking at the sheet in R1C1-notation. Then we clearly see that there are a lot of identical formulas referencing neighbouring cells.

So we could try to compile all shared formulas (after some threshold) to single methods with sheet, row, column parameters. Such a method could then access neighbours through some sort of dispatch routine which routes a sheet, row, column index to the appropriate cell getter. If we analyze for every shared formula the set of possible neighbours, we might get fairly compact and efficient dispatch methods, too.

Here’s a sketch of what I mean by this:

public static final class OutputsDesired implements Outputs
{
  private static final double[] COST_TABLE = { 150, 180, 240 /* ... all of 37 elts */};
  private static final double[] PERCENT_TABLE = { .03, .03, .03 /* ... all of 37 elts */};
  private static final double[] AGE_UPTO = { 20, 20, 20 /* ... all of 37 elts */};
  private static final double[] INSURED_AMOUNT_UPTO = { 5000, 6000, 8000 };

  private final Inputs i;

  public OutputsDesired( Inputs _inputs )
  {
    this.i = _inputs;
  }

  private double getAge()
  {
    return this.i.getAge();
  }

  private double getInsuredAmount()
  {
    return this.i.getInsuredAmount();
  }

  public double getInsuranceCostPerYear()
  {
    int r;
    switch (r = ((int) getMatchingRow()) - 1) {

      case 27:
        return getInsuredAmount() * (0.02 + Math.abs( getAge() - 50 ) / 25 * 0.01);

      case 28:
      case 29:
      case 30:
      case 31:
      case 32:
      case 33:
        return getStyle3( 0, r + 1, 7 );

      default:
        if (r >= 0 && r < COST_TABLE.length) {
          return COST_TABLE[ r ];
        }
        return 0;

    }
  }

  private double getStyle3( int _s, int _r, int _c )
  {
    return getInsuredAmount() * PERCENT_TABLE[ _r - 13 ];
  }

  private final double getMatchingRow()
  {
    for (int r = 13; r < 50; r++) {
      if (getMatch( 1, r, 3 )) return r;
    }
    return 0;
  }

  /**
   * The boolean return here is again a peephole optimization.
   */
  private final boolean getMatch( int _s, int _r, int _c )
  {
    return (getAge() <= getAgeUpto( _s, _r, _c + 1 ) && getInsuredAmount() <= getInsuredAmountUpto( _s, _r, _c + 2 ));
  }

  /**
   * If AFC sees that all the cells that can be referenced relatively are constant, then there
   * is no switch, just a table.
   */
  private double getAgeUpto( int _s, int _r, int _c )
  {
    return AGE_UPTO[ _r - 13 ];
  }

  private double getInsuredAmountUpto( int _s, int _r, int _c )
  {
    return INSURED_AMOUNT_UPTO[ _r - 13 ];
  }

}