¯\_(ツ)_/¯

This commit is contained in:
pxi 2023-12-30 22:35:09 +01:00
parent 98462670f1
commit 86b7a5d143
24 changed files with 655 additions and 2 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.csv
go-fast-replace/bin/*

View File

@ -1,3 +1,30 @@
# fast-replace
# Fast Replace
This is a testing repo to try and find out what is the fastest way to search and replace a test file.
This is a testing repo to try and find out what is the fastest way to search and replace a test file.
## The rules are simple
All tests must contain 3 programs:
### A key value generator (called pairs)
Generate a key/value list where the key is the search value and the value is the replace value.
Call the generated file `pairs.csv`.
### A random file generator (called corpus)
A program that can generate a file with random data mixed with the keys of your previous set.
Call the random ordered file `corpus.csv`.
### A random order replacer (called replace)
- Load the `pairs.csv` file and `corpus.csv`.
- Next start a benchmark.
- Perform replacements and output to a file called `replaced.csv`.
- Stop the benchmark.
## Conclusions
Add a `README.md` to the base of your test file to explain the usage of your programs and expose your conclusions.

View File

@ -0,0 +1,10 @@
# None of the targets produces a file of the same name; declaring them phony
# keeps a stray file or directory called install, run or test from making
# the target look up to date.
.PHONY: install run test

# Generate the pairs and the corpus.
install:
	./bin/pairs.sh
	./bin/corpus.sh

# Perform the replacements.
run:
	./bin/replace.sh

# Check the result of the replacements.
test:
	./bin/test.sh

View File

@ -0,0 +1,43 @@
# Bash Fast Replace
An implementation in bash.
## Usage
To create the `corpus.csv` and `pairs.csv` files and perform the replacements:
```bash
make install # create corpus and pairs
make run # create file with the replaced values
```
Then to test:
```bash
make test # Checks that the replacement worked
```
## Conclusions
`shuf` is amazing. Using /dev/urandom directly produces broken pipe errors when run from Make.
Speed is not great... But no surprise there.
````bash
$ make install
./bin/pairs.sh
Generating csv/pairs.csv...
The generation of 1000 pairs took 2s
./bin/corpus.sh
Generating csv/corpus.csv...
generated in 3s
$ make run
./bin/replace.sh
Generating replaced.csv...
Replacement took 3s
$ make test
./bin/test.sh
test OK
````

36
bash-fast-replace/bin/corpus.sh Executable file
View File

@ -0,0 +1,36 @@
#! /bin/bash
# Generates csv/corpus.csv: AMOUNT lines of random lowercase letters, each
# with one zero-padded numeric key embedded at a random position. The lines
# are shuffled at the end so the keys do not appear in ascending order.

START=$(date +%s)
# Project root: the parent of the directory that contains this script.
# Every expansion is quoted so that paths containing spaces keep working.
BASE=$(dirname "$(dirname "$(realpath -s "$0")")")
AMOUNT=1000
FILE_NAME="corpus.csv"
FULL_PATH="$BASE/csv/$FILE_NAME"

mkdir -p "$BASE/csv"
rm -f "$FULL_PATH"

echo "Generating csv/$FILE_NAME..."

# Keys are zero-padded to the number of digits in AMOUNT.
KEY_LEN=${#AMOUNT}
# Number of random letters per line, so that letters + key is 100 characters.
LINE_LEN=$((100 - KEY_LEN))

for (( i = 0; i < AMOUNT; i++ )); do
    KEY=$(printf "%0${KEY_LEN}d" "$i")
    # Split the random letters around the key at a random position. The second
    # part is whatever is left of LINE_LEN, which keeps the line length fixed.
    LEN1=$(shuf -i "0-$LINE_LEN" -n 1)
    LEN2=$((LINE_LEN - LEN1))
    PART1=$(shuf -er -n "$LEN1" {a..z} | paste -sd "")
    PART2=$(shuf -er -n "$LEN2" {a..z} | paste -sd "")
    echo "$PART1$KEY$PART2" >> "$FULL_PATH"
done

shuf "$FULL_PATH" > "$BASE/tmp.csv" && mv "$BASE/tmp.csv" "$FULL_PATH"

END=$(date +%s)
# Not named SECONDS: that is a special Bash variable with its own semantics.
ELAPSED=$((END - START))
echo "$FILE_NAME generated in ${ELAPSED}s"

28
bash-fast-replace/bin/pairs.sh Executable file
View File

@ -0,0 +1,28 @@
#! /bin/bash
# Generates csv/pairs.csv: AMOUNT lines of the form KEY,VALUE where KEY is a
# zero-padded number (the search value) and VALUE is a random string of
# uppercase letters of the same length (the replace value).

START=$(date +%s)
AMOUNT=1000
# Project root: the parent of the directory that contains this script.
# Every expansion is quoted so that paths containing spaces keep working.
BASE=$(dirname "$(dirname "$(realpath -s "$0")")")
FILE_NAME="pairs.csv"
FULL_PATH="$BASE/csv/$FILE_NAME"

mkdir -p "$BASE/csv"
rm -f "$FULL_PATH"

echo "Generating csv/$FILE_NAME..."

# Keys are zero-padded to the number of digits in AMOUNT.
KEY_LEN=${#AMOUNT}

for (( i = 0; i < AMOUNT; i++ )); do
    KEY=$(printf "%0${KEY_LEN}d" "$i")
    VAL=$(shuf -er -n "$KEY_LEN" {A..Z} | paste -sd "")
    echo "$KEY,$VAL" >> "$FULL_PATH"
done

END=$(date +%s)
# Not named SECONDS: that is a special Bash variable with its own semantics.
ELAPSED=$((END - START))
echo "The generation of $AMOUNT pairs took ${ELAPSED}s"

View File

@ -0,0 +1,27 @@
#! /bin/bash
# Replaces every key listed in csv/pairs.csv by its value throughout
# csv/corpus.csv and writes the result to csv/replaced.csv.

START=$(date +%s)
# Project root: the parent of the directory that contains this script.
BASE=$(dirname "$(dirname "$(realpath -s "$0")")")
CORPUS_FILE="$BASE/csv/corpus.csv"
PAIRS_FILE="$BASE/csv/pairs.csv"
REPLACED_FILE="$BASE/csv/replaced.csv"

echo "Generating replaced.csv..."

# Collect one substitution command per pair so that sed reads the corpus once
# instead of rewriting the whole file once per pair. sed applies the commands
# in pairs.csv order to every line, which yields the same text as running the
# substitutions as separate passes.
# Keys and values are inserted verbatim: they must not contain "/" or
# characters that are special in a sed expression.
SED_SCRIPT=""
while IFS=, read -r KEY VAL || [ -n "$KEY" ]; do
    # An empty key would expand to "s//.../", which sed rejects.
    [ -n "$KEY" ] || continue
    SED_SCRIPT+="s/$KEY/$VAL/g"$'\n'
done < "$PAIRS_FILE"

if ! sed -e "$SED_SCRIPT" "$CORPUS_FILE" > "$REPLACED_FILE"; then
    echo "Replacement failed."
    exit 1
fi

END=$(date +%s)
# Not named SECONDS: that is a special Bash variable with its own semantics.
ELAPSED=$((END - START))
echo "Replacement took ${ELAPSED}s"

19
bash-fast-replace/bin/test.sh Executable file
View File

@ -0,0 +1,19 @@
#! /bin/bash
# Tests csv/replaced.csv: the test fails when any digit is left in the file
# and passes otherwise.

# Project root: the parent of the directory that contains this script.
BASE=$(dirname "$(dirname "$(realpath -s "$0")")")
REPLACED_FILE="$BASE/csv/replaced.csv"

if [ ! -f "$REPLACED_FILE" ]; then
    echo "File not found: $REPLACED_FILE"
    echo "Generate replacements first."
    exit 1
fi

# grep stops at the first digit it finds, so the file never has to be loaded
# into a shell variable.
if grep -q '[0-9]' "$REPLACED_FILE"; then
    echo 'test NOK'
    exit 1
fi

echo 'test OK'
exit 0

View File

13
go-fast-replace/Makefile Normal file
View File

@ -0,0 +1,13 @@
# None of the targets produces a file of the same name; declaring them phony
# keeps a stray file or directory called build, run or test from making the
# target look up to date.
.PHONY: build run test

# Compile the three programs into bin/.
build:
	go build -o bin/pairs cmd/pairs/main.go
	go build -o bin/corpus cmd/corpus/main.go
	go build -o bin/replace cmd/replace/main.go

# Generate the corpus and the pairs, then run the replacement benchmark.
run:
	./bin/corpus
	./bin/pairs
	./bin/replace

# Run the test script.
test:
	./bin/test.sh

54
go-fast-replace/README.md Normal file
View File

@ -0,0 +1,54 @@
# Go Fast Replace
An implementation in Go.
## To build
````bash
make build
````
## To run
````bash
make run
````
## To test
````bash
make test
````
## Conclusions
### Replace
Current implementations:
- `bm` uses `strings.NewReplacer` which uses the [BoyerMoore](https://go.dev/src/strings/search.go) algorithm.
- `cw` uses [CommentzWalter](https://en.wikipedia.org/wiki/Commentz-Walter_algorithm) algorithm.
#### Results
- `bm` varies a lot, but is never over _460µs_.
- `cw` is a lot slower, at around _3.5ms_, but it can search for multiple patterns at once.
````shell
$ make run
./bin/corpus
Creating corpus...
./bin/pairs
Creating pairs...
./bin/replace
bm: 273.621µs
cw: 2.939254ms
````
#### Todo:
- Implement parallelism on the CommentzWalter implementation.
- [BoyerMoore variants](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#Variants)
- [RabinKarp](https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm) algorithm.
- [AhoCorasick](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm) algorithm.

View File

@ -0,0 +1,25 @@
package main
import (
"fmt"
"log"
"os"
"internal/corpus"
)
// main generates the corpus with corpus.MakeCorpus and writes it, one line
// per row, to ./csv/corpus.csv (relative to the working directory). Any
// write failure terminates the program with a non-zero exit status.
func main() {
	fmt.Println("Creating corpus...")

	lines := corpus.MakeCorpus()

	// Assemble the file in memory so that it is written by a single call
	// whose error is checked, instead of many unchecked WriteString calls.
	var data []byte
	for _, line := range lines {
		data = append(data, line...)
		data = append(data, '\n')
	}

	// 0o644: a CSV file has no use for the executable bit.
	if err := os.WriteFile("./csv/corpus.csv", data, 0o644); err != nil {
		log.Fatal(err)
	}
}

View File

@ -0,0 +1,25 @@
package main
import (
"fmt"
"log"
"os"
"internal/pairs"
)
// main generates the key/value pairs with pairs.MakePairs and writes them,
// one pair per row, to ./csv/pairs.csv (relative to the working directory).
// Any write failure terminates the program with a non-zero exit status.
func main() {
	fmt.Println("Creating pairs...")

	lines := pairs.MakePairs()

	// Assemble the file in memory so that it is written by a single call
	// whose error is checked, instead of many unchecked WriteString calls.
	var data []byte
	for _, line := range lines {
		data = append(data, line...)
		data = append(data, '\n')
	}

	// 0o644: a CSV file has no use for the executable bit.
	if err := os.WriteFile("./csv/pairs.csv", data, 0o644); err != nil {
		log.Fatal(err)
	}
}

View File

@ -0,0 +1,107 @@
package main
import (
"encoding/csv"
"errors"
"fmt"
"log"
"os"
"time"
"internal/replace"
)
// main loads ./csv/corpus.csv and ./csv/pairs.csv, runs every algorithm
// selected by getArgs over the corpus, prints how long each replacement took
// (preparation is excluded from the timing) and writes the output of the last
// algorithm that ran to ./csv/replaced.csv.
//
// Every failure is fatal and ends the program with a non-zero exit status.
func main() {
	run, err := getArgs()
	if err != nil {
		log.Fatal(err)
	}

	corpusBytes, err := os.ReadFile("./csv/corpus.csv")
	if err != nil {
		log.Fatal(err)
	}
	corpus := string(corpusBytes)

	// Read the CSV file containing the replacement key-value pairs.
	pairsHandle, err := os.Open("./csv/pairs.csv")
	if err != nil {
		log.Fatalf("opening pairs: %v", err)
	}
	replacements, err := csv.NewReader(pairsHandle).ReadAll()
	// Closed explicitly rather than deferred: log.Fatal exits without
	// running deferred calls. The file was only read, so the Close error
	// carries no information worth acting on.
	pairsHandle.Close()
	if err != nil {
		log.Fatalf("reading pairs: %v", err)
	}
	// Both algorithms index record[0] and record[1]; reject short records
	// here instead of panicking later.
	for n, record := range replacements {
		if len(record) < 2 {
			log.Fatalf("pairs record %d has %d field(s), want key and value", n+1, len(record))
		}
	}

	var replaced string
	for _, algo := range run {
		var elapsedTime time.Duration
		switch algo {
		case "cw":
			// Prepare
			cw := replace.NewCommentzWalter()
			for _, record := range replacements {
				cw.AddPattern(record[0], record[1])
			}
			cw.BuildMatchingMachine()
			// Replace
			startTime := time.Now()
			replaced = cw.CommentzWalterReplace(corpus)
			elapsedTime = time.Since(startTime)
		case "bm":
			// Prepare
			pairs := make([]string, 0, 2*len(replacements))
			for _, record := range replacements {
				pairs = append(pairs, record[0], record[1])
			}
			// Replace
			startTime := time.Now()
			replaced = replace.BoyerMooreReplace(corpus, pairs)
			elapsedTime = time.Since(startTime)
		}
		fmt.Printf("\n%s: %s\n", algo, elapsedTime)
	}

	// Write the replaced text to a file. 0o644: a CSV file has no use for
	// the executable bit.
	if err := os.WriteFile("./csv/replaced.csv", []byte(replaced), 0o644); err != nil {
		log.Fatalf("writing output file: %v", err)
	}
}
// getArgs returns the algorithms selected by the first command-line argument.
//
// Without an argument, or with "all", it returns "bm" and "cw" in that order.
// "bm" or "cw" selects only that algorithm. Any other value yields a nil
// slice and an error that names the rejected argument.
func getArgs() ([]string, error) {
	if len(os.Args) < 2 {
		return []string{"bm", "cw"}, nil
	}
	switch arg := os.Args[1]; arg {
	case "all":
		return []string{"bm", "cw"}, nil
	case "bm", "cw":
		return []string{arg}, nil
	default:
		return nil, errors.New(`invalid argument "` + arg + `": expected all, bm or cw`)
	}
}

View File

10
go-fast-replace/go.mod Normal file
View File

@ -0,0 +1,10 @@
module fast-replace
go 1.21.5
require internal/corpus v1.0.0
replace internal/corpus => ./internal/corpus
require internal/pairs v1.0.0
replace internal/pairs => ./internal/pairs
require internal/replace v1.0.0
replace internal/replace => ./internal/replace

View File

@ -0,0 +1,45 @@
package corpus
import (
"fmt"
"math/rand"
)
// MakeCorpus builds 1000 corpus lines, one for each key from 1 to 1000, and
// returns them in random order. Every line is produced by makeLine with a
// line length of 100.
func MakeCorpus() []string {
	const (
		lineCount  = 1000
		lineLength = 100
	)

	// Generate one line per key.
	var corpus []string
	for key := 1; key <= lineCount; key++ {
		corpus = append(corpus, makeLine(key, lineLength))
	}

	// Shuffle in place: swap every position with a random position at or
	// before it.
	for pos := 0; pos < len(corpus); pos++ {
		other := rand.Intn(pos + 1)
		corpus[pos], corpus[other] = corpus[other], corpus[pos]
	}

	return corpus
}
// makeLine returns a line of lineLength characters: the six-digit,
// zero-padded form of line with random text from randomString before and
// after it. The split between the two random parts is chosen at random.
//
// When lineLength leaves no room for random text (lineLength <= len(key)),
// the key alone is returned; rand.Intn would otherwise panic on a
// non-positive argument.
func makeLine(line int, lineLength int) string {
	key := fmt.Sprintf("%06d", line)

	padding := lineLength - len(key)
	if padding <= 0 {
		return key
	}

	// Intn(padding+1) covers 0..padding inclusive, so the key can land
	// anywhere in the line, including at the very end.
	before := rand.Intn(padding + 1)
	return randomString(before) + key + randomString(padding-before)
}
// randomString returns a string of n characters, each picked at random from
// the lowercase letters a to z.
func randomString(n int) string {
	const alphabet = "abcdefghijklmnopqrstuvwxyz"

	out := make([]byte, n)
	for pos := 0; pos < n; pos++ {
		out[pos] = alphabet[rand.Intn(len(alphabet))]
	}
	return string(out)
}

View File

@ -0,0 +1,3 @@
module corpus
go 1.21.5

View File

@ -0,0 +1,3 @@
module pairs
go 1.21.5

View File

@ -0,0 +1,31 @@
package pairs
import (
"fmt"
"math/rand"
)
// MakePairs returns 1000 rows of the form "key,value", one for each key from
// 1 to 1000 in ascending order. Keys come from makeKey and values from
// makeValue.
func MakePairs() []string {
	const count = 1000

	var rows []string
	for n := 1; n <= count; n++ {
		row := makeKey(n) + "," + makeValue()
		rows = append(rows, row)
	}
	return rows
}
// makeKey formats i as a decimal number zero-padded to a width of six.
func makeKey(i int) string {
	const width = 6
	return fmt.Sprintf("%0*d", width, i)
}
// makeValue returns a string of six characters, each picked at random from
// the uppercase letters A to Z.
func makeValue() string {
	const (
		alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		size     = 6
	)

	out := make([]byte, size)
	for pos := 0; pos < size; pos++ {
		out[pos] = alphabet[rand.Intn(len(alphabet))]
	}
	return string(out)
}

View File

@ -0,0 +1,11 @@
package replace
import "strings"
// BoyerMooreReplace returns corpus with the replacements described by pairs
// applied by a strings.Replacer. pairs is a flat list of old/new strings
// (old1, new1, old2, new2, ...) and is passed to strings.NewReplacer as is.
//
// NOTE(review): strings.NewReplacer selects its own algorithm from the shape
// of pairs; the Boyer-Moore finder in strings/search.go appears to be used
// only when a single pair is given. Confirm against the standard library
// before attributing benchmark results to Boyer-Moore.
// See https://go.dev/src/strings/search.go
func BoyerMooreReplace(corpus string, pairs []string) string {
	return strings.NewReplacer(pairs...).Replace(corpus)
}

View File

@ -0,0 +1,18 @@
package replace
import (
"testing"
"internal/replace"
)
// TestBoyerMooreReplace checks that a single key embedded in surrounding
// text is replaced and that the rest of the text is left untouched.
//
// NOTE(review): this file declares package replace and also imports
// "internal/replace"; verify that this does not produce an import cycle when
// the tests are run.
func TestBoyerMooreReplace(t *testing.T) {
	const (
		corpus = "abcdefghi012345jklmnopqrst"
		want   = "abcdefghiABCDEFjklmnopqrst"
	)
	pairs := []string{"012345", "ABCDEF"}

	// Report input, result and expectation so that a failure can be
	// diagnosed from the test log alone.
	if got := replace.BoyerMooreReplace(corpus, pairs); got != want {
		t.Errorf("BoyerMooreReplace(%q, %q) = %q, want %q", corpus, pairs, got, want)
	}
}

View File

@ -0,0 +1,113 @@
package replace
// Node is one state of the pattern trie.
type Node struct {
// children maps a rune to the state reached by reading that rune.
children map[rune]*Node
// output is the index in CommentzWalter.patterns of the pattern that ends
// at this node, or -1 when no pattern ends here.
output int
// fail is the state to fall back to when no child matches the next rune.
// It is nil until BuildMatchingMachine sets it, and stays nil on the root.
fail *Node
}
// NewNode returns an empty trie node: it has no children, no pattern ends at
// it (output is -1) and it has no failure link.
func NewNode() *Node {
	node := new(Node)
	node.children = map[rune]*Node{}
	node.output = -1
	return node
}
// CommentzWalter holds a set of search patterns, their replacements and the
// trie built from the patterns.
type CommentzWalter struct {
// root is the root state of the trie.
root *Node
// patterns lists the search patterns in the order they were added.
patterns []string
// replacements maps a pattern's index in patterns to its replacement text.
replacements map[int]string
}
// NewCommentzWalter returns a matcher with a fresh root node and no
// patterns or replacements registered.
func NewCommentzWalter() *CommentzWalter {
	matcher := new(CommentzWalter)
	matcher.root = NewNode()
	matcher.patterns = []string{}
	matcher.replacements = map[int]string{}
	return matcher
}
// AddPattern registers pattern as a search string and replacement as the
// text to substitute for it. The replacement is stored under the index the
// pattern receives in cw.patterns. The trie is not touched here; it is built
// by BuildMatchingMachine.
func (cw *CommentzWalter) AddPattern(pattern string, replacement string) {
	next := len(cw.patterns)
	cw.patterns = append(cw.patterns, pattern)
	cw.replacements[next] = replacement
}
// BuildMatchingMachine inserts every registered pattern into the trie and
// then assigns a failure link to every node below the root. Until it is
// called the trie holds no patterns.
func (cw *CommentzWalter) BuildMatchingMachine() {
	// Phase 1: insert the patterns rune by rune. The node reached by the
	// last rune of a pattern records that pattern's index.
	for patternIdx, pattern := range cw.patterns {
		node := cw.root
		for _, r := range pattern {
			next, ok := node.children[r]
			if !ok {
				next = NewNode()
				node.children[r] = next
			}
			node = next
		}
		node.output = patternIdx
	}

	// Phase 2: breadth-first walk, so a node's failure link is known before
	// its children are processed. Nodes directly below the root fall back
	// to the root.
	var pending []*Node
	for _, top := range cw.root.children {
		pending = append(pending, top)
		top.fail = cw.root
	}
	for head := 0; head < len(pending); head++ {
		parent := pending[head]
		for r, child := range parent.children {
			pending = append(pending, child)

			// Follow the parent's failure chain to the first state that has
			// a transition on r; with none, fall back to the root.
			fallback := parent.fail
			for fallback != nil && fallback.children[r] == nil {
				fallback = fallback.fail
			}
			if fallback != nil {
				child.fail = fallback.children[r]
			} else {
				child.fail = cw.root
			}
		}
	}
}
// CommentzWalterReplace returns text with every occurrence of a registered
// pattern replaced by its replacement.
//
// The text is scanned once from left to right. Matches never overlap: the
// match that ends first wins, and among matches ending on the same rune the
// longest one wins. After a match the scan restarts from the root, so a
// replacement is never scanned again. Everything that is not part of a match,
// including a partial match that later fails, is copied unchanged. Empty
// patterns are ignored.
//
// The trie is built by BuildMatchingMachine; if that was not called, text is
// returned unchanged.
//
// NOTE(review): a trie with failure links scanned forwards matches the
// Aho-Corasick scheme rather than Commentz-Walter; the name is kept for
// existing callers.
func (cw *CommentzWalter) CommentzWalterReplace(text string) string {
	runes := []rune(text)
	replaced := make([]rune, 0, len(runes))
	current := cw.root
	copied := 0 // runes[:copied] has already been emitted into replaced

	for i, char := range runes {
		// Fall back along the failure links until a state accepts char.
		// The root stays put on a rune it has no transition for.
		for current != cw.root && current.children[char] == nil {
			if current.fail == nil {
				current = cw.root
				break
			}
			current = current.fail
		}
		if next := current.children[char]; next != nil {
			current = next
		}

		// A pattern ends at i if this state, or a state on its failure
		// chain, carries an output. The chain runs from the longest suffix
		// to the shortest, so the first hit is the longest match.
		matchIdx := -1
		for n := current; n != nil && n != cw.root; n = n.fail {
			if n.output != -1 {
				matchIdx = n.output
				break
			}
		}
		if matchIdx == -1 {
			continue
		}

		// Emit the untouched text in front of the match, then the
		// replacement. The match cannot start before copied because the
		// state was reset to the root right after the previous match.
		matchStart := i + 1 - len([]rune(cw.patterns[matchIdx]))
		replaced = append(replaced, runes[copied:matchStart]...)
		replaced = append(replaced, []rune(cw.replacements[matchIdx])...)
		copied = i + 1
		current = cw.root
	}

	// Whatever follows the last match, including an unfinished partial
	// match, is plain text.
	replaced = append(replaced, runes[copied:]...)
	return string(replaced)
}

View File

@ -0,0 +1,3 @@
module replace
go 1.21.5