2
0
Эх сурвалжийг харах

Very notable performance gains with string caching performance and bingen decoding (#1367)

Signed-off-by: Matt Bolt <mbolt35@gmail.com>

Signed-off-by: Matt Bolt <mbolt35@gmail.com>
Matt Bolt 3 жил өмнө
parent
commit
c3fe30892b

+ 14 - 9
pkg/util/buffer.go

@@ -388,15 +388,20 @@ func intDataSize(data interface{}) int {
 
 // Conversion from byte slice to string
 func bytesToString(b []byte) string {
-	// The following commented code will map an existing byte slice's underlying array
-	// to a string. The purpose is to avoid new allocation of a string from an existing
-	// byte array. This is a solid optimization to save total allocations, but effectively
-	// pins the string's existence to the underlying byte array. This can prevent GC of the
-	// bytes in a few cases, and we should just return a newly allocated string to allow
-	// future optimization.
-	//return *(*string)(unsafe.Pointer(&b))
-
-	return stringutil.Bank(string(b))
+	// This code will take the passed byte slice and cast it in-place into a string. By doing
+	// this, we are pinning the byte slice's underlying array in memory, preventing it from
+	// being garbage collected while the string is still in use. If we are using the Bank()
+	// functionality to cache new strings, we risk keeping the pinned array alive. To avoid this,
+	// we will use the BankFunc() call which uses the cast string to check for existence of a
+	// cached string. If it exists, then we drop the pinned reference immediately and use the
+	// cached string. If it does _not_ exist, then we use the passed func() string to allocate a new
+	// string and cache it. This will prevent us from allocating throw-away strings just to
+	// check our cache.
+	pinned := *(*string)(unsafe.Pointer(&b))
+
+	return stringutil.BankFunc(pinned, func() string {
+		return string(b)
+	})
 }
 
 // Direct string to byte conversion that doesn't allocate.

+ 64 - 7
pkg/util/stringutil/stringutil.go

@@ -23,10 +23,54 @@ const (
 var alpha = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 var alphanumeric = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
 
-// Any strings created at runtime, duplicate or not, are copied, even though by specification,
-// a go string is immutable. This utility allows us to cache runtime strings and retrieve them
-// when we expect heavy duplicates.
-var strings sync.Map
+type stringBank struct {
+	lock sync.Mutex
+	m    map[string]string
+}
+
+func newStringBank() *stringBank {
+	return &stringBank{
+		m: make(map[string]string),
+	}
+}
+
+func (sb *stringBank) LoadOrStore(key, value string) (string, bool) {
+	sb.lock.Lock()
+
+	if v, ok := sb.m[key]; ok {
+		sb.lock.Unlock()
+		return v, ok
+	}
+
+	sb.m[key] = value
+	sb.lock.Unlock()
+	return value, false
+}
+
+// LoadOrStoreFunc returns the string already banked under key, if any, with the
+// second result set to true. Otherwise it calls f, stores the result keyed by
+// itself (not by key), and returns it with the second result set to false.
+//
+// f runs while the bank's mutex is held, so it must not call back into the bank.
+func (sb *stringBank) LoadOrStoreFunc(key string, f func() string) (string, bool) {
+	sb.lock.Lock()
+	// Deferred so the mutex is released even if f panics; otherwise a panic
+	// recovered by a caller would leave every later bank call blocked.
+	defer sb.lock.Unlock()
+
+	if v, ok := sb.m[key]; ok {
+		return v, true
+	}
+
+	// Key the entry by the string f produced rather than by key: the caller's
+	// key may alias memory that is released or reused later.
+	value := f()
+	sb.m[value] = value
+	return value, false
+}
+
+func (sb *stringBank) Clear() {
+	sb.lock.Lock()
+	sb.m = make(map[string]string)
+	sb.lock.Unlock()
+}
+
+// stringBank is an unbounded string cache that is thread-safe. It is especially useful when
+// many of the dynamically allocated strings being stored are duplicates of one another.
+var strings = newStringBank() // sync.Map
 
 func init() {
 	rand.Seed(time.Now().UnixNano())
@@ -36,7 +80,18 @@ func init() {
 // the string as the unique instance.
 func Bank(s string) string {
 	ss, _ := strings.LoadOrStore(s, s)
-	return ss.(string)
+	return ss
+}
+
+// BankFunc will use the provided s string to check for an existing allocation of the string. However,
+// if no allocation exists, the f parameter will be used to create the string and store it in the bank.
+func BankFunc(s string, f func() string) string {
+	ss, _ := strings.LoadOrStoreFunc(s, f)
+	return ss
+}
+
+func ClearBank() {
+	strings.Clear()
 }
 
 // RandSeq generates a pseudo-random alphabetic string of the given length
@@ -85,7 +140,8 @@ func StringSlicesEqual(left, right []string) bool {
 	// Build maps for each slice that counts each unique instance
 	leftMap := make(map[string]int, len(left))
 	for _, str := range left {
-		count, ok := leftMap[str]; if ok {
+		count, ok := leftMap[str]
+		if ok {
 			leftMap[str] = count + 1
 		} else {
 			leftMap[str] = 1
@@ -93,7 +149,8 @@ func StringSlicesEqual(left, right []string) bool {
 	}
 	rightMap := make(map[string]int, len(right))
 	for _, str := range right {
-		count, ok := rightMap[str]; if ok {
+		count, ok := rightMap[str]
+		if ok {
 			rightMap[str] = count + 1
 		} else {
 			rightMap[str] = 1

+ 144 - 0
pkg/util/stringutil/stringutil_test.go

@@ -0,0 +1,144 @@
+package stringutil
+
+import (
+	"fmt"
+	"math/rand"
+	"runtime"
+	"runtime/debug"
+	"sync"
+	"testing"
+)
+
+var oldBank sync.Map
+
+// This is the old implementation of the string bank to use for comparison benchmarks
+func BankLegacy(s string) string {
+	ss, _ := oldBank.LoadOrStore(s, s)
+	return ss.(string)
+}
+
+func ClearBankLegacy() {
+	oldBank = sync.Map{}
+}
+
+func copyString(s string) string {
+	return string([]byte(s))
+}
+
+func generateBenchData(totalStrings, totalUnique int) []string {
+	randStrings := make([]string, 0, totalStrings)
+
+	// create totalUnique unique strings
+	for i := 0; i < totalUnique; i++ {
+		randStrings = append(randStrings, fmt.Sprintf("%s/%s/%s", RandSeq(10), RandSeq(10), RandSeq(10)))
+	}
+
+	// set the seed such that the resulting "remainder" strings are deterministic for each bench
+	rand.Seed(1523942)
+
+	// append a random selection from 0-totalUnique to the list.
+	for i := 0; i < totalStrings-totalUnique; i++ {
+		randStrings = append(randStrings, copyString(randStrings[rand.Intn(totalUnique)]))
+	}
+
+	// shuffle the list of strings
+	rand.Shuffle(totalStrings, func(i, j int) { randStrings[i], randStrings[j] = randStrings[j], randStrings[i] })
+
+	return randStrings
+}
+
+// benchmarkLegacyStringBank times BankLegacy over totalStrings generated strings,
+// totalUnique of which are distinct. Only the banking loop is timed; data
+// generation and the per-iteration cache reset and GC run with the timer stopped.
+func benchmarkLegacyStringBank(b *testing.B, totalStrings, totalUnique int) {
+	b.StopTimer()
+	randStrings := generateBenchData(totalStrings, totalUnique)
+
+	for i := 0; i < b.N; i++ {
+		b.StartTimer()
+		// The index is named j so it does not shadow the *testing.B parameter b.
+		for j := 0; j < totalStrings; j++ {
+			BankLegacy(randStrings[j])
+		}
+		b.StopTimer()
+		// Start every iteration from an empty cache and a collected heap.
+		ClearBankLegacy()
+		runtime.GC()
+		debug.FreeOSMemory()
+	}
+}
+
+// benchmarkStringBank times the current string bank over totalStrings generated
+// strings, totalUnique of which are distinct. When useBankFunc is true the
+// strings are banked through BankFunc, otherwise through Bank. Only the banking
+// loop is timed; data generation and the per-iteration cache reset and GC run
+// with the timer stopped.
+func benchmarkStringBank(b *testing.B, totalStrings, totalUnique int, useBankFunc bool) {
+	b.StopTimer()
+	randStrings := generateBenchData(totalStrings, totalUnique)
+
+	for i := 0; i < b.N; i++ {
+		b.StartTimer()
+		// The index is named j so it does not shadow the *testing.B parameter b.
+		for j := 0; j < totalStrings; j++ {
+			if useBankFunc {
+				BankFunc(randStrings[j], func() string { return randStrings[j] })
+			} else {
+				Bank(randStrings[j])
+			}
+		}
+		b.StopTimer()
+		// Start every iteration from an empty cache and a collected heap.
+		ClearBank()
+		runtime.GC()
+		debug.FreeOSMemory()
+	}
+}
+
+func BenchmarkLegacyStringBank90PercentDuplicate(b *testing.B) {
+	benchmarkLegacyStringBank(b, 1_000_000, 100_000)
+}
+
+func BenchmarkLegacyStringBank75PercentDuplicate(b *testing.B) {
+	benchmarkLegacyStringBank(b, 1_000_000, 250_000)
+}
+
+// BenchmarkLegacyStringBank50PercentDuplicate banks 1M strings of which 500k are
+// unique, so half of the inputs are duplicates.
+func BenchmarkLegacyStringBank50PercentDuplicate(b *testing.B) {
+	benchmarkLegacyStringBank(b, 1_000_000, 500_000)
+}
+
+func BenchmarkLegacyStringBank25PercentDuplicate(b *testing.B) {
+	benchmarkLegacyStringBank(b, 1_000_000, 750_000)
+}
+
+func BenchmarkLegacyStringBankNoDuplicate(b *testing.B) {
+	benchmarkLegacyStringBank(b, 1_000_000, 1_000_000)
+}
+
+func BenchmarkStringBank90PercentDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 100_000, false)
+}
+
+func BenchmarkStringBank75PercentDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 250_000, false)
+}
+
+// BenchmarkStringBank50PercentDuplicate banks 1M strings through Bank, of which
+// 500k are unique, so half of the inputs are duplicates.
+func BenchmarkStringBank50PercentDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 500_000, false)
+}
+
+func BenchmarkStringBank25PercentDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 750_000, false)
+}
+
+func BenchmarkStringBankNoDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 1_000_000, false)
+}
+
+// The BenchmarkStringBankFunc* benchmarks mirror the BenchmarkStringBank* set but
+// pass useBankFunc=true so that BankFunc, rather than Bank, is what gets measured.
+
+// BenchmarkStringBankFunc90PercentDuplicate banks 1M strings, 100k of them unique.
+func BenchmarkStringBankFunc90PercentDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 100_000, true)
+}
+
+// BenchmarkStringBankFunc75PercentDuplicate banks 1M strings, 250k of them unique.
+func BenchmarkStringBankFunc75PercentDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 250_000, true)
+}
+
+// BenchmarkStringBankFunc50PercentDuplicate banks 1M strings, 500k of them unique.
+func BenchmarkStringBankFunc50PercentDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 500_000, true)
+}
+
+// BenchmarkStringBankFunc25PercentDuplicate banks 1M strings, 750k of them unique.
+func BenchmarkStringBankFunc25PercentDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 750_000, true)
+}
+
+// BenchmarkStringBankFuncNoDuplicate banks 1M strings, all of them unique.
+func BenchmarkStringBankFuncNoDuplicate(b *testing.B) {
+	benchmarkStringBank(b, 1_000_000, 1_000_000, true)
+}