From 41df1f8725f87f3ed8f83bf19d7408e83f787cf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Mart=C3=AD?= Date: Sat, 7 Jan 2023 16:18:07 +0000 Subject: [PATCH] slightly reduce the range of hashed name lengths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.8.0 was improved to obfuscate names so that they didn't all end up with the same length. We went from a fixed length of 8 to a range between 8 and 15. Since the new mechanism chooses a length evenly between 8 and 15, the average hashed name length went from 8 to 11.5. While this improved obfuscation, it also increased the size of obfuscated binaries by quite a bit. We do need to use a reasonably large length to avoid collisions within packages, but we also don't want it to be large enough to cause too much bloat. Reduce the minimum and maximum from 8 and 15 to 6 and 12. This means that the average goes from 11.5 to 9, much closer to the old average. We could consider a range of 4 to 12, but 4 bytes is short enough that collisions become likely in practice, even if it's just the minimum. And a range of 6 to 10 shrinks the range a bit too much. name old time/op new time/op delta Build-16 20.6s ± 0% 20.6s ± 0% ~ (p=0.421 n=5+5) name old bin-B new bin-B delta Build-16 5.77M ± 0% 5.66M ± 0% -1.92% (p=0.008 n=5+5) name old cached-time/op new cached-time/op delta Build-16 705ms ± 4% 703ms ± 2% ~ (p=0.690 n=5+5) name old mallocs/op new mallocs/op delta Build-16 25.0M ± 0% 25.0M ± 0% -0.08% (p=0.008 n=5+5) name old sys-time/op new sys-time/op delta Build-16 8.11s ± 2% 8.11s ± 2% ~ (p=0.841 n=5+5) Updates #618. 
--- hash.go | 86 ++++++++++++++++++++------------------ testdata/script/seed.txtar | 4 +- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/hash.go b/hash.go index 3473fd6..e45e3c0 100644 --- a/hash.go +++ b/hash.go @@ -177,7 +177,7 @@ var ( nameCharset = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_z" nameBase64 = base64.NewEncoding(nameCharset).WithPadding(base64.NoPadding) - b64NameBuffer [16]byte // nameBase64.EncodedLen(neededSumBytes) = 16 + b64NameBuffer [12]byte // nameBase64.EncodedLen(neededSumBytes) = 12 ) // These funcs mimic the unicode package API, but byte-based since we know @@ -210,6 +210,51 @@ func hashWithStruct(strct *types.Struct, fieldName string) string { return hashWithCustomSalt(fieldsSalt, fieldName) } +// minHashLength and maxHashLength define the range for the number of base64 +// characters to use for the final hashed name. +// +// minHashLength needs to be long enough to realistically avoid hash collisions, +// but maxHashLength should be short enough to not bloat binary sizes. +// The namespace for collisions is generally a single package, since +// that's where most hashed names are namespaced to. +// +// Using a "hash collision" formula, and taking a generous estimate of a +// package having 10k names, we get the following probabilities. +// Most packages will have far fewer names, but some packages are huge, +// especially generated ones. +// +// We also have slightly fewer bits in practice, since the base64 +// charset has 'z' twice, and the first base64 char is coerced into a +// valid Go identifier. So we must be conservative. +// Remember that base64 stores 6 bits per encoded byte. +// The probability numbers are approximated. 
+// +// length (base64) | length (bits) | collision probability +// ------------------------------------------------------- +// 4 24 ~95% +// 5 30 ~4% +// 6 36 ~0.07% +// 7 42 ~0.001% +// 8 48 ~0.00001% +// +// We want collisions to be practically impossible, so the hashed names end up +// with lengths evenly distributed between 6 and 12. Naively, this results in an +// average length of 9, which has a chance well below 1 in a million even when a +// package has thousands of obfuscated names. +// +// These numbers are also chosen to keep obfuscated binary sizes reasonable. +// For example, increasing the average length of 9 by 1 results in roughly a 1% +// increase in binary sizes. +const ( + minHashLength = 6 + maxHashLength = 12 + + // At most we'll need maxHashLength base64 characters, + // so 9 checksum bytes are enough for that purpose, + // which is nameBase64.DecodedLen(12) being rounded up. + neededSumBytes = 9 +) + // hashWithCustomSalt returns a hashed version of name, // including the provided salt as well as opts.Seed into the hash input. // @@ -224,45 +269,6 @@ func hashWithCustomSalt(salt []byte, name string) string { panic("hashWithCustomSalt: empty name") } - // minHashLength and maxHashLength define the range for the number of base64 - // characters to use for the final hashed name. - // - // minHashLength needs to be long enough to realistically avoid hash collisions, - // but maxHashLength should be short enough to not bloat binary sizes. - // The namespace for collisions is generally a single package, since - // that's where most hashed names are namespaced to. - // - // Using a "hash collision" formula, and taking a generous estimate of a - // package having 10k names, we get the following probabilities. - // Most packages will have far fewer names, but some packages are huge, - // especially generated ones. 
- // - // We also have slightly fewer bits in practice, since the base64 - // charset has 'z' twice, and the first base64 char is coerced into a - // valid Go identifier. So we must be conservative. - // Remember that base64 stores 6 bits per encoded byte. - // The probability numbers are approximated. - // - // length (base64) | length (bits) | collision probability - // ------------------------------------------------------- - // 4 24 ~95% - // 5 30 ~4% - // 6 36 ~0.07% - // 7 42 ~0.001% - // 8 48 ~0.00001% - // - // We want collisions to be practically impossible, so we choose 8 as - // minHashLength to end up with a chance of about 1 in a million even when a - // package has thousands of obfuscated names. - // - // In practice, the probability will be lower, as the lengths end up - // somewhere between minHashLength and maxHashLength. - const minHashLength = 8 - const maxHashLength = 15 - // At most we'll need maxHashLength (15) base64 characters, - // so 12 checksum bytes are enough for that purpose, rounding up. - const neededSumBytes = 12 - hasher.Reset() hasher.Write(salt) hasher.Write(flagSeed.bytes) diff --git a/testdata/script/seed.txtar b/testdata/script/seed.txtar index f77acf9..36dada9 100644 --- a/testdata/script/seed.txtar +++ b/testdata/script/seed.txtar @@ -132,10 +132,10 @@ func mainFunc() { var count [16]int for _, name := range hashedNames { name = name[len("main."):] - if len(name) < 8 { + if len(name) < 6 { panic("ended up with a hashed name that's too short: "+name) } - if len(name) > 15 { + if len(name) > 12 { panic("ended up with a hashed name that's too long: "+name) } count[len(name)]++