From 86b7a5d1430db40fd110743bcd96bac7da0f31e6 Mon Sep 17 00:00:00 2001 From: pxi Date: Sat, 30 Dec 2023 22:35:09 +0100 Subject: [PATCH] =?UTF-8?q?=C2=AF\=5F(=E3=83=84)=5F/=C2=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + README.md | 31 +++++- bash-fast-replace/Makefile | 10 ++ bash-fast-replace/README.md | 43 ++++++++ bash-fast-replace/bin/corpus.sh | 36 +++++++ bash-fast-replace/bin/pairs.sh | 28 +++++ bash-fast-replace/bin/replace.sh | 27 +++++ bash-fast-replace/bin/test.sh | 19 ++++ bash-fast-replace/csv/.gitkeep | 0 go-fast-replace/Makefile | 13 +++ go-fast-replace/README.md | 54 ++++++++++ go-fast-replace/cmd/corpus/main.go | 25 +++++ go-fast-replace/cmd/pairs/main.go | 25 +++++ go-fast-replace/cmd/replace/main.go | 107 ++++++++++++++++++ go-fast-replace/csv/.gitkeep | 0 go-fast-replace/go.mod | 10 ++ go-fast-replace/internal/corpus/corpus.go | 45 ++++++++ go-fast-replace/internal/corpus/go.mod | 3 + go-fast-replace/internal/pairs/go.mod | 3 + go-fast-replace/internal/pairs/pairs.go | 31 ++++++ go-fast-replace/internal/replace/bm.go | 11 ++ go-fast-replace/internal/replace/bm_test.go | 18 ++++ go-fast-replace/internal/replace/cw.go | 113 ++++++++++++++++++++ go-fast-replace/internal/replace/go.mod | 3 + 24 files changed, 655 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 bash-fast-replace/Makefile create mode 100644 bash-fast-replace/README.md create mode 100755 bash-fast-replace/bin/corpus.sh create mode 100755 bash-fast-replace/bin/pairs.sh create mode 100755 bash-fast-replace/bin/replace.sh create mode 100755 bash-fast-replace/bin/test.sh create mode 100644 bash-fast-replace/csv/.gitkeep create mode 100644 go-fast-replace/Makefile create mode 100644 go-fast-replace/README.md create mode 100644 go-fast-replace/cmd/corpus/main.go create mode 100644 go-fast-replace/cmd/pairs/main.go create mode 100644 go-fast-replace/cmd/replace/main.go create mode 100644 
go-fast-replace/csv/.gitkeep create mode 100644 go-fast-replace/go.mod create mode 100644 go-fast-replace/internal/corpus/corpus.go create mode 100644 go-fast-replace/internal/corpus/go.mod create mode 100644 go-fast-replace/internal/pairs/go.mod create mode 100644 go-fast-replace/internal/pairs/pairs.go create mode 100644 go-fast-replace/internal/replace/bm.go create mode 100644 go-fast-replace/internal/replace/bm_test.go create mode 100644 go-fast-replace/internal/replace/cw.go create mode 100644 go-fast-replace/internal/replace/go.mod diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1cb768d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.csv +go-fast-replace/bin/* diff --git a/README.md b/README.md index 607025c..59d8c0f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,30 @@ -# fast-replace +# Fast Replace -This is a testing repo to try and find out what is the fastest way to search and replace a test file. +This is a testing repo to try and find out what is the fastest way to search and replace a test file. + +## The rules are simple + +All tests must contain 3 programs: + +### A key value generator (called pairs) + +Generate a key/value list where the key is the search value and the value is the replace value. + +Call the generated file `pairs.csv`. + +### A random file generator (called corpus) + +A program that can generate a file with random data mixed with the keys of your previous set. + +Call the random ordered file `corpus.csv`. + +### A random order replacer (called replace) + +- Load the `pairs.csv` file and `corpus.csv`. +- Next start a benchmark. +- Perform replacements and output to a file called `replaced.csv`. +- Stop the benchmark. + +## Conclusions + +Add a `README.md` to the base of your test file to explain the usage of your programs and expose your conclusions. 
#! /bin/bash

# Generates csv/corpus.csv: AMOUNT lines of random lowercase letters, each
# with one zero-padded numeric key embedded at a random offset, written in
# shuffled order.

START=$(date +%s)

# Quote every expansion so the script still works from a path with spaces.
BASE=$(dirname "$(dirname "$(realpath -s "$0")")")
AMOUNT=1000
FILE_NAME="corpus.csv"
FULL_PATH="$BASE/csv/$FILE_NAME"

if [ -f "$FULL_PATH" ]; then
    rm "$FULL_PATH"
fi

echo "Generating csv/$FILE_NAME..."

# Keys are padded to the number of digits in AMOUNT.
KEY_LEN=${#AMOUNT}
LINE_LEN=$((100 - KEY_LEN))
for (( i = 0; i < AMOUNT; i++ )); do
    KEY=$(printf "%0${KEY_LEN}d" "$i")

    # Random split of the filler before and after the key.
    LEN1=$(shuf -i "0-$LINE_LEN" -n 1)
    LEN2=$((100 - LEN1))

    PART1=$(shuf -er -n "$LEN1" {a..z} | paste -sd "")
    PART2=$(shuf -er -n "$LEN2" {a..z} | paste -sd "")

    echo "$PART1$KEY$PART2"
done > "$FULL_PATH" # open the output once instead of once per line

# Shuffle the lines so the keys do not appear in numeric order.
shuf "$FULL_PATH" > "$BASE/tmp.csv" && mv "$BASE/tmp.csv" "$FULL_PATH"

END=$(date +%s)
# Do not assign to SECONDS: it is a bash special variable (a running timer).
ELAPSED=$((END - START))
# The original printed the undefined $FILE here, so the name was blank.
echo "$FILE_NAME generated in ${ELAPSED}s"
#! /bin/bash

# Checks csv/replaced.csv: the run passes only when the file contains no
# digit at all. Exit status is 0 on success and 1 on failure or when the
# file is missing.
BASE=$(dirname $(dirname $(realpath -s $0)));
REPLACED_FILE="$BASE/csv/replaced.csv"

# Guard: nothing to check until the replace step has produced its output.
if ! [ -f "$REPLACED_FILE" ]; then
    echo "File not found: $REPLACED_FILE"
    echo "Generate replacements first."
    exit 1
fi

# Any remaining digit fails the check.
if grep -q '[0-9]' "$REPLACED_FILE"; then
    echo 'test NOK'
    exit 1
fi

echo 'test OK'
exit 0
+- `cw` is a lot slower, at around _3.5ms_, but it can search for multiple patterns at once. + +````shell +$ make run +./bin/corpus +Creating corpus... +./bin/pairs +Creating pairs... +./bin/replace + +bm: 273.621µs + +cw: 2.939254ms +```` + +#### Todo: +- Implement parallelism on the Commentz–Walter implementation. +- [Boyer–Moore variants](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#Variants) +- [Rabin–Karp](https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm) algorithm. +- [Aho–Corasick](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm) algorithm. \ No newline at end of file diff --git a/go-fast-replace/cmd/corpus/main.go b/go-fast-replace/cmd/corpus/main.go new file mode 100644 index 0000000..27d9f9e --- /dev/null +++ b/go-fast-replace/cmd/corpus/main.go @@ -0,0 +1,25 @@ +package main + +import ( + "fmt" + "log" + "os" + "internal/corpus" +) + +func main() { + fmt.Println("Creating corpus...") + f, err := os.OpenFile("./csv/corpus.csv", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) + if err != nil { + log.Fatal(err) + } + + defer f.Close() + + lines := corpus.MakeCorpus() + + // Write to file + for _, line := range lines { + f.WriteString(line + "\n") + } +} \ No newline at end of file diff --git a/go-fast-replace/cmd/pairs/main.go b/go-fast-replace/cmd/pairs/main.go new file mode 100644 index 0000000..26667cc --- /dev/null +++ b/go-fast-replace/cmd/pairs/main.go @@ -0,0 +1,25 @@ +package main + +import ( + "fmt" + "log" + "os" + "internal/pairs" +) + +func main() { + fmt.Println("Creating pairs...") + f, err := os.OpenFile("./csv/pairs.csv", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) + if err != nil { + log.Fatal(err) + } + + defer f.Close() + + lines := pairs.MakePairs() + + // Write to file + for _, line := range lines { + f.WriteString(line + "\n") + } +} \ No newline at end of file diff --git a/go-fast-replace/cmd/replace/main.go b/go-fast-replace/cmd/replace/main.go new file mode 100644 index 0000000..0aff1e7 --- 
/dev/null +++ b/go-fast-replace/cmd/replace/main.go @@ -0,0 +1,107 @@ +package main + +import ( + "encoding/csv" + "errors" + "fmt" + "log" + "os" + "time" + "internal/replace" +) + +func main() { + run, err := getArgs() + if err != nil { + log.Fatal(err) + return + } + + corpusBytes, err := os.ReadFile("./csv/corpus.csv") + if err != nil { + log.Fatal(err) + return + } + + // Read the CSV file containing replacement key-value pairs + pairsHandle, err := os.Open("./csv/pairs.csv") + if err != nil { + fmt.Println("Error opening pairs:", err) + return + } + defer pairsHandle.Close() + + replacements, err := csv.NewReader(pairsHandle).ReadAll() + if err != nil { + fmt.Println("Error reading pairs:", err) + return + } + + corpus := string(corpusBytes) + + var pairs []string + var replaced string + var startTime time.Time + var elapsedTime time.Duration + for _, algo := range run { + switch algo { + case "cw": + // Prepare + cw := replace.NewCommentzWalter() + for _, record := range replacements { + cw.AddPattern(record[0], record[1]) + } + cw.BuildMatchingMachine() + + // Replace + startTime = time.Now() + replaced = cw.CommentzWalterReplace(corpus) + elapsedTime = time.Since(startTime) + case "bm": + // Prepare... 
+ for _, record := range replacements { + pairs = append(pairs, record[0], record[1]) + } + + // Replace + startTime = time.Now() + replaced = replace.BoyerMooreReplace(corpus, pairs) + elapsedTime = time.Since(startTime) + } + fmt.Printf("\n%s: %s\n", algo, elapsedTime) + } + + // Write replaced text to a file + outputHandle, err := os.OpenFile("./csv/replaced.csv", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) + if err != nil { + fmt.Println("Error creating output file:", err) + return + } + defer outputHandle.Close() + + // Write replaced text to the file + _, err = outputHandle.WriteString(replaced) + if err != nil { + fmt.Println("Error writing to file:", err) + return + } +} + +func getArgs() ([]string, error) { + var run = []string{"bm", "cw"} + + if len(os.Args) > 1 { + switch os.Args[1] { + case "all": + run = []string{"bm", "cw"} + case "cw": + run = []string{"cw"} + case "bm": + run = []string{"bm"} + default: + return []string{}, errors.New("Invalid argument.") + } + } + + return run, nil +} diff --git a/go-fast-replace/csv/.gitkeep b/go-fast-replace/csv/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/go-fast-replace/go.mod b/go-fast-replace/go.mod new file mode 100644 index 0000000..b5ec0d3 --- /dev/null +++ b/go-fast-replace/go.mod @@ -0,0 +1,10 @@ +module fast-replace + +go 1.21.5 + +require internal/corpus v1.0.0 +replace internal/corpus => ./internal/corpus +require internal/pairs v1.0.0 +replace internal/pairs => ./internal/pairs +require internal/replace v1.0.0 +replace internal/replace => ./internal/replace diff --git a/go-fast-replace/internal/corpus/corpus.go b/go-fast-replace/internal/corpus/corpus.go new file mode 100644 index 0000000..761f955 --- /dev/null +++ b/go-fast-replace/internal/corpus/corpus.go @@ -0,0 +1,45 @@ +package corpus + +import ( + "fmt" + "math/rand" +) + +func MakeCorpus() []string { + // Generate data + lineLength := 100 + var lines []string + for i := 1; i <= 1000; i++ { + lines = append(lines, makeLine(i, 
// makeLine builds one corpus line: the zero-padded six-digit key for line,
// embedded at a random offset in random lowercase filler so that the whole
// line is lineLength characters long. At least one filler character always
// follows the key, because rand.Intn never returns its upper bound.
//
// When lineLength leaves no room for filler (lineLength <= len(key)) the key
// is returned on its own; rand.Intn would otherwise panic on a non-positive
// argument.
func makeLine(line int, lineLength int) string {
	key := fmt.Sprintf("%06d", line)

	filler := lineLength - len(key)
	if filler <= 0 {
		return key
	}

	before := rand.Intn(filler)
	return randomString(before) + key + randomString(filler-before)
}

// randomString returns n random characters drawn from a-z. A non-positive n
// yields the empty string.
func randomString(n int) string {
	const letters = "abcdefghijklmnopqrstuvwxyz"

	if n <= 0 {
		return ""
	}

	// The alphabet is ASCII, so a byte slice is enough.
	s := make([]byte, n)
	for i := range s {
		s[i] = letters[rand.Intn(len(letters))]
	}
	return string(s)
}
file mode 100644 index 0000000..efa6a6d --- /dev/null +++ b/go-fast-replace/internal/replace/bm.go @@ -0,0 +1,11 @@ +package replace + +import "strings" + +// Uses strings.NewReplacer (which implements Boyer-Moore search) +// See https://go.dev/src/strings/search.go +func BoyerMooreReplace(corpus string, pairs []string) string { + replacer := strings.NewReplacer(pairs...) + + return replacer.Replace(corpus) +} diff --git a/go-fast-replace/internal/replace/bm_test.go b/go-fast-replace/internal/replace/bm_test.go new file mode 100644 index 0000000..d14bf4d --- /dev/null +++ b/go-fast-replace/internal/replace/bm_test.go @@ -0,0 +1,18 @@ +package replace + +import ( + "testing" + "internal/replace" +) + +func TestBoyerMooreReplace(t *testing.T) { + var corpus string = "abcdefghi012345jklmnopqrst" + var pairs = []string{"012345", "ABCDEF"} + + var want string = "abcdefghiABCDEFjklmnopqrst" + got := replace.BoyerMooreReplace(corpus, pairs) + + if want != got { + t.Error("Replacement was wrong.") + } +} \ No newline at end of file diff --git a/go-fast-replace/internal/replace/cw.go b/go-fast-replace/internal/replace/cw.go new file mode 100644 index 0000000..ed33d12 --- /dev/null +++ b/go-fast-replace/internal/replace/cw.go @@ -0,0 +1,113 @@ +package replace + +type Node struct { + children map[rune]*Node + output int + fail *Node +} + +func NewNode() *Node { + return &Node{ + children: make(map[rune]*Node), + output: -1, + fail: nil, + } +} + +type CommentzWalter struct { + root *Node + patterns []string + replacements map[int]string +} + +func NewCommentzWalter() *CommentzWalter { + return &CommentzWalter{ + root: NewNode(), + patterns: make([]string, 0), + replacements: make(map[int]string), + } +} + +func (cw *CommentzWalter) AddPattern(pattern string, replacement string) { + cw.patterns = append(cw.patterns, pattern) + idx := len(cw.patterns) - 1 + cw.replacements[idx] = replacement +} + +func (cw *CommentzWalter) BuildMatchingMachine() { + for idx, pattern := range 
cw.patterns { + current := cw.root + for _, char := range pattern { + if _, exists := current.children[char]; !exists { + current.children[char] = NewNode() + } + current = current.children[char] + } + current.output = idx + } + + queue := make([]*Node, 0) + for _, node := range cw.root.children { + queue = append(queue, node) + node.fail = cw.root + } + + for len(queue) > 0 { + r := queue[0] + queue = queue[1:] + + for char, child := range r.children { + queue = append(queue, child) + failNode := r.fail + for failNode != nil && failNode.children[char] == nil { + failNode = failNode.fail + } + + if failNode == nil { + child.fail = cw.root + } else { + child.fail = failNode.children[char] + } + } + } +} + +func (cw *CommentzWalter) CommentzWalterReplace(text string) string { + matches := make([]string, 0) + current := cw.root + runes := []rune(text) + replaced := make([]rune, 0) + + for i, char := range runes { + for current != nil && current.children[char] == nil { + current = current.fail + } + + if current == nil { + current = cw.root + replaced = append(replaced, runes[i]) + continue + } + + current = current.children[char] + if current.output != -1 { + matchIdx := current.output + matches = append(matches, cw.replacements[matchIdx]) + replRunes := []rune(cw.replacements[matchIdx]) + + for _, rn := range replRunes { + replaced = append(replaced, rn) + } + + patternLen := len(cw.patterns[matchIdx]) + replaceLen := len(cw.replacements[matchIdx]) + if (patternLen > replaceLen) { + // Skip index to after the pattern + dif := patternLen - replaceLen + i = i + dif + } + } + } + + return string(replaced) +} diff --git a/go-fast-replace/internal/replace/go.mod b/go-fast-replace/internal/replace/go.mod new file mode 100644 index 0000000..609e3fd --- /dev/null +++ b/go-fast-replace/internal/replace/go.mod @@ -0,0 +1,3 @@ +module replace + +go 1.21.5