Common Sub-Expressions In Large Sheets
Typical Sheet
The following sheet, while grossly oversimplified, seems to be a very typical sheet for the insurance models used by insurance companies (a relative of mine is a mathematician at one of them). The typical things are the large table and its inhomogeneity (in practice, the tables are much larger than the one here). This is not a simple repeating section with a perfectly homogeneous structure.
A | B | C | D | E | F | G | H | I | |
1 | Inputs | ||||||||
2 | Insured amount | $15,000.00 (InsuredAmount) |
|||||||
3 | Age | 21 (Age) |
|||||||
4 | |||||||||
5 | Outputs | ||||||||
6 | Cost per year | 480 =INDEX(G12:G48,B9) (InsuranceCostPerYear) |
|||||||
7 | |||||||||
8 | Helpers | ||||||||
9 | Matching row | 15 =MATCH(true,C12:C48,0.0) |
|||||||
10 | |||||||||
11 | Lookup Table | AgeUpTo | Match | AgeUpTo (Copy) | InsuredAmountUpTo | Percentage | InsuranceCostPerYear | ||
12 | 20 | false =AND(Age<=D12,InsuredAmount<=E12) |
20 =B12 |
$5,000.00 | 3% | $150.00 =E12*F12 |
|||
13 | false =AND(Age<=D13,InsuredAmount<=E13) |
20 =B12 |
$6,000.00 | 3% | $180.00 =E13*F13 |
||||
14 | false =AND(Age<=D14,InsuredAmount<=E14) |
20 =B12 |
$8,000.00 | 3% | $240.00 =E14*F14 |
||||
15 | false =AND(Age<=D15,InsuredAmount<=E15) |
20 =B12 |
$10,000.00 | 4% | $400.00 =E15*F15 |
Here we switch the percentage. | |||
16 | false =AND(Age<=D16,InsuredAmount<=E16) |
20 =B12 |
$13,000.00 | 4% | $520.00 =E16*F16 |
||||
17 | false =AND(Age<=D17,InsuredAmount<=E17) |
20 =B12 |
$16,000.00 | 4% | $640.00 =E17*F17 |
||||
18 | false =AND(Age<=D18,InsuredAmount<=E18) |
20 =B12 |
$20,000.00 | 5% | $1,000.00 =E18*F18 |
Again. | |||
19 | false =AND(Age<=D19,InsuredAmount<=E19) |
20 =B12 |
$25,000.00 | 5% | $1,250.00 =E19*F19 |
||||
20 | false =AND(Age<=D20,InsuredAmount<=E20) |
20 =B12 |
$30,000.00 | 5% | $1,500.00 =E20*F20 |
||||
21 | 22 | false =AND(Age<=D21,InsuredAmount<=E21) |
22 =B21 |
$5,000.00 | 3% | $150.00 =E21*F21 |
The next age group is similar, but with different percentages. | ||
22 | false =AND(Age<=D22,InsuredAmount<=E22) |
22 =B21 |
$6,000.00 | 3% | $180.00 =E22*F22 |
||||
23 | false =AND(Age<=D23,InsuredAmount<=E23) |
22 =B21 |
$8,000.00 | 3% | $240.00 =E23*F23 |
||||
24 | false =AND(Age<=D24,InsuredAmount<=E24) |
22 =B21 |
$10,000.00 | 3% | $300.00 =E24*F24 |
||||
25 | false =AND(Age<=D25,InsuredAmount<=E25) |
22 =B21 |
$13,000.00 | 3% | $390.00 =E25*F25 |
||||
26 | true =AND(Age<=D26,InsuredAmount<=E26) |
22 =B21 |
$16,000.00 | 3% | $480.00 =E26*F26 |
||||
27 | true =AND(Age<=D27,InsuredAmount<=E27) |
22 =B21 |
$20,000.00 | 4% | $800.00 =E27*F27 |
||||
28 | true =AND(Age<=D28,InsuredAmount<=E28) |
22 =B21 |
$25,000.00 | 4% | $1,000.00 =E28*F28 |
||||
29 | true =AND(Age<=D29,InsuredAmount<=E29) |
22 =B21 |
$30,000.00 | 4% | $1,200.00 =E29*F29 |
||||
30 | 25 | false =AND(Age<=D30,InsuredAmount<=E30) |
25 =B30 |
$5,000.00 | 3% | $150.00 =E30*F30 |
Similar again, different percentages again. | ||
31 | false =AND(Age<=D31,InsuredAmount<=E31) |
25 =B30 |
$6,000.00 | 3% | $180.00 =E31*F31 |
||||
32 | false =AND(Age<=D32,InsuredAmount<=E32) |
25 =B30 |
$8,000.00 | 3% | $240.00 =E32*F32 |
||||
33 | false =AND(Age<=D33,InsuredAmount<=E33) |
25 =B30 |
$10,000.00 | 3% | $300.00 =E33*F33 |
||||
34 | false =AND(Age<=D34,InsuredAmount<=E34) |
25 =B30 |
$13,000.00 | 3% | $390.00 =E34*F34 |
||||
35 | true =AND(Age<=D35,InsuredAmount<=E35) |
25 =B30 |
$16,000.00 | 3% | $480.00 =E35*F35 |
||||
36 | true =AND(Age<=D36,InsuredAmount<=E36) |
25 =B30 |
$20,000.00 | 3% | $600.00 =E36*F36 |
||||
37 | true =AND(Age<=D37,InsuredAmount<=E37) |
25 =B30 |
$25,000.00 | 3% | $750.00 =E37*F37 |
||||
38 | true =AND(Age<=D38,InsuredAmount<=E38) |
25 =B30 |
$30,000.00 | 3% | $900.00 =E38*F38 |
||||
39 | 60 | true =AND(Age<=D39,InsuredAmount<=E39) |
60 =B39 |
$30,000.00 | 3.1600% =2.0%+ABS(Age-50.0)/25.0*1.0% |
$474.00 =InsuredAmount*F39 |
Complete switch of structure here. | ||
40 | 200 | false =AND(Age<=D40,InsuredAmount<=E40) |
200 =B40 |
$5,000.00 | 3% | $450.00 =InsuredAmount*F40 |
Percentages again, but no fixed amounts. | ||
41 | false =AND(Age<=D41,InsuredAmount<=E41) |
200 =B40 |
$6,000.00 | 3% | $450.00 =InsuredAmount*F41 |
||||
42 | false =AND(Age<=D42,InsuredAmount<=E42) |
200 =B40 |
$8,000.00 | 3% | $450.00 =InsuredAmount*F42 |
||||
43 | false =AND(Age<=D43,InsuredAmount<=E43) |
200 =B40 |
$10,000.00 | 4% | $600.00 =InsuredAmount*F43 |
||||
44 | false =AND(Age<=D44,InsuredAmount<=E44) |
200 =B40 |
$13,000.00 | 4% | $600.00 =InsuredAmount*F44 |
||||
45 | true =AND(Age<=D45,InsuredAmount<=E45) |
200 =B40 |
$16,000.00 | 4% | $600.00 =InsuredAmount*F45 |
||||
46 | true =AND(Age<=D46,InsuredAmount<=E46) |
200 =B40 |
$20,000.00 | 5% | $750.00 =InsuredAmount*F46 |
||||
47 | true =AND(Age<=D47,InsuredAmount<=E47) |
200 =B40 |
$25,000.00 | 5% | $750.00 =InsuredAmount*F47 |
||||
48 | true =AND(Age<=D48,InsuredAmount<=E48) |
200 =B40 |
$30,000.00 | 5% | $750.00 =InsuredAmount*F48 |
Currently, SEJ compiles this to something like this:
public static final class OutputsCurrent implements Outputs { private static final double[] COST_TABLE = { 150, 180, 240 /* ... all of 37 elts */}; private final Inputs i; public OutputsCurrent( Inputs _inputs ) { this.i = _inputs; } private double getAge() { return this.i.getAge(); } private double getInsuredAmount() { return this.i.getInsuredAmount(); } public double getInsuranceCostPerYear() { int r; switch (r = ((int) getMatchingRow()) - 1) { case 27: return getInsuredAmount() * (0.02 + Math.abs( getAge() - 50 ) / 25 * 0.01); case 28: return getInsuredAmount() * 0.03; case 29: return getInsuredAmount() * 0.03; case 30: return getInsuredAmount() * 0.03; case 31: return getInsuredAmount() * 0.04; case 32: return getInsuredAmount() * 0.04; case 33: return getInsuredAmount() * 0.04; /* ... for all rows of this form */ default: if (r >= 0 && r < COST_TABLE.length) { return COST_TABLE[ r ]; } return 0; } } private final double getMatchingRow() { int r = 1; // This is what really happens: if (((getAge() > 20.0 || getInsuredAmount() > 5000.0) ? 1.0 : 0.0) != 0.0) { r++; // This is what could be made to happen with a bit of peephole optimization: if (getAge() > 20.0 || getInsuredAmount() > 6000.0) { r++; if (getAge() > 20.0 || getInsuredAmount() > 8000.0) { r++; // ... for all of 37 cases */ } else { r = 0; } } } return r; } }
There are two stupid things here (apart from the possible peephole optimization noted in the code):
- The method
getMatchingRow()
is a nightmare of redundancy and code bloat. - So is the repetition of
return getInsuredAmount() * x
ingetInsuranceCostPerYear()
.
Now, code bloat now only means more code, it also means
- less efficient use of high-speed CPU caches,
- more JIT overhead, and
- no clearly identifiable, small, and often-used hotspots in the code for the JITer to optimize heavily.
So the question is, how do we get more efficient code?
Homogenized Sheet
Now, this table could be made homogeneous, of course. It involves a formula type column in the table, and then another INDEX
to select the proper computation variant to use (only the start of the sheet is shown):
A | B | C | D | E | F | G | H | I | J | K | |
2 | Insured amount | $15,000.00 (InsuredAmount) |
|||||||||
3 | Age | 62 (Age) |
|||||||||
4 | |||||||||||
5 | Outputs | ||||||||||
6 | Cost per year | 600 =VLOOKUP(true,C9:K45,9.0,false) (InsuranceCostPerYear) |
|||||||||
7 | |||||||||||
8 | Lookup Table | AgeUpTo | Match | AgeUpTo (Copy) | InsuredAmountUpTo | Percentage | Style | Style 1 | Style 2 | Style 3 | Insured amount |
9 | 20 | false =AND(Age<=D9,InsuredAmount<=E9) |
20 =B9 |
$5,000.00 | 3% | 1 | $150.00 =E9*F9 |
$372.00 =InsuredAmount*(2.0%+ABS(Age-50.0)/25.0*1.0%) |
$450.00 =InsuredAmount*F9 |
$150.00 =INDEX(H9:J9,G9) |
|
10 | false =AND(Age<=D10,InsuredAmount<=E10) |
20 =B9 |
$6,000.00 | 3% | 1 | $180.00 =E10*F10 |
$372.00 =InsuredAmount*(2.0%+ABS(Age-50.0)/25.0*1.0%) |
$450.00 =InsuredAmount*F10 |
$180.00 =INDEX(JD10:JF10,G10) |
||
11 | false =AND(Age<=D11,InsuredAmount<=E11) |
20 =B9 |
$8,000.00 | 3% | 1 | $240.00 =E11*F11 |
$372.00 =InsuredAmount*(2.0%+ABS(Age-50.0)/25.0*1.0%) |
$450.00 =InsuredAmount*F11 |
$240.00 =INDEX(JD11:JF11,G11) |
Now we could simply declare a non-dynamic section over the table (a feature SEJ would have to learn), which would make VLOOKUP()
iterate over a statically initialized section (from the constants found in the range) and return the final result for the first match.
This would perform well even if the static section were a sliding cursor across static double[]
arrays (for very large tables). Only if we need more than one return value from the table would we run into trouble: the lookup methods only ever return a single value. So we would have to re-iterate the table for every needed value.
If the static section were held in an array of objects (rather than a sliding cursor), we could use an initial MATCH
and then multiple calls to INDEX
to work around this with decent performance (at the cost of slightly higher memory usage).
Normalized Sheet
The approach taken above is like relational normalization, really. Done more fully, we get:
A | B | C | D | E | F | G | |
1 | Inputs | ||||||
2 | Insured amount | $15,000.00 (InsuredAmount) |
|||||
3 | Age | 62 (Age) |
|||||
4 | |||||||
5 | Outputs | ||||||
6 | Cost per year | 600 =INDEX(B11:B13,B10) (InsuranceCostPerYear) |
|||||
7 | |||||||
8 | Helpers | ||||||
9 | Matching row | 34 =MATCH(true,C16:C52,0.0) |
|||||
10 | Computation style | 3 =INDEX(G16:G52,B9) |
|||||
11 | Style 1 | $640.00 =INDEX(E16:E52,B9)*INDEX(F16:F52,B9) |
Amount rounded up; percentage from table | ||||
12 | Style 2 | $372.00 =InsuredAmount*(2.0%+ABS(Age-50.0)/25.0*1.0%) |
Exact amount; percentage computed from age | ||||
13 | Style 3 | $600.00 =InsuredAmount*INDEX(F16:F52,B9) |
Exact amount; percentage from table | ||||
14 | |||||||
15 | Lookup Table | AgeUpTo | Match | AgeUpTo (Copy) | InsuredAmountUpTo | Percentage | Style |
16 | 20 | false =AND(Age<=D16,InsuredAmount<=E16) |
20 =B16 |
$5,000.00 | 3% | 1 | |
17 | false =AND(Age<=D17,InsuredAmount<=E17) |
20 =B16 |
$6,000.00 | 3% | 1 | ||
18 | false =AND(Age<=D18,InsuredAmount<=E18) |
20 =B16 |
$8,000.00 | 3% | 1 |
If the static section is held in an array of objects (rather than a sliding cursor), we would get decent performance from this. The sliding cursor would have to re-iterate for every INDEX()
we have here.
Typical Sheet Again
All this is not how typical Excel users think, however. They will much sooner come up with the first version of the sheet than the others.
So the question is, can we compile the first version to efficient code? Maybe by looking at the sheet in R1C1-notation. Then we clearly see that there are a lot of identical formulas referencing neighbouring cells.
So we could try to compile all shared formulas (after some threshold) to single methods with sheet, row, column parameters. Such a method could then access neighbours through some sort of dispatch routine which routes a sheet, row, column index to the appropriate cell getter. If we analyze for every shared formula the set of possible neighbours, we might get fairly compact and efficient dispatch methods, too.
Here’s a sketch of what I mean by this:
public static final class OutputsDesired implements Outputs { private static final double[] COST_TABLE = { 150, 180, 240 /* ... all of 37 elts */}; private static final double[] PERCENT_TABLE = { .03, .03, .03 /* ... all of 37 elts */}; private static final double[] AGE_UPTO = { 20, 20, 20 /* ... all of 37 elts */}; private static final double[] INSURED_AMOUNT_UPTO = { 5000, 6000, 8000 }; private final Inputs i; public OutputsDesired( Inputs _inputs ) { this.i = _inputs; } private double getAge() { return this.i.getAge(); } private double getInsuredAmount() { return this.i.getInsuredAmount(); } public double getInsuranceCostPerYear() { int r; switch (r = ((int) getMatchingRow()) - 1) { case 27: return getInsuredAmount() * (0.02 + Math.abs( getAge() - 50 ) / 25 * 0.01); case 28: case 29: case 30: case 31: case 32: case 33: return getStyle3( 0, r + 1, 7 ); default: if (r >= 0 && r < COST_TABLE.length) { return COST_TABLE[ r ]; } return 0; } } private double getStyle3( int _s, int _r, int _c ) { return getInsuredAmount() * PERCENT_TABLE[ _r - 13 ]; } private final double getMatchingRow() { for (int r = 13; r < 50; r++) { if (getMatch( 1, r, 3 )) return r; } return 0; } /** * The boolean return here is again a peephole optimization. */ private final boolean getMatch( int _s, int _r, int _c ) { return (getAge() <= getAgeUpto( _s, _r, _c + 1 ) && getInsuredAmount() <= getInsuredAmountUpto( _s, _r, _c + 2 )); } /** * If AFC sees that all the cells that can be referenced relatively are constant, then there * is no switch, just a table. */ private double getAgeUpto( int _s, int _r, int _c ) { return AGE_UPTO[ _r - 13 ]; } private double getInsuredAmountUpto( int _s, int _r, int _c ) { return INSURED_AMOUNT_UPTO[ _r - 13 ]; } }