¯\_(ツ)_/¯
parent
98462670f1
commit
86b7a5d143
|
@ -0,0 +1,2 @@
|
|||
*.csv
|
||||
go-fast-replace/bin/*
|
29
README.md
29
README.md
|
@ -1,3 +1,30 @@
|
|||
# fast-replace
|
||||
# Fast Replace
|
||||
|
||||
This is a testing repo to try and find out what is the fastest way to search and replace a test file.
|
||||
|
||||
## The rules are simple
|
||||
|
||||
All tests must contain 3 programs:
|
||||
|
||||
### A key value generator (called pairs)
|
||||
|
||||
Generate a key/value list where the key is the search value and the value is the replace value.
|
||||
|
||||
Call the generated file `pairs.csv`.
|
||||
|
||||
### A random file generator (called corpus)
|
||||
|
||||
A program that can generate a file with random data mixed with the keys of your previous set.
|
||||
|
||||
Call the random ordered file `corpus.csv`.
|
||||
|
||||
### A random order replacer (called replace)
|
||||
|
||||
- Load the `pairs.csv` file and `corpus.csv`.
|
||||
- Next start a benchmark.
|
||||
- Perform replacements and output to a file called `replaced.csv`.
|
||||
- Stop the benchmark.
|
||||
|
||||
## Conclusions
|
||||
|
||||
Add a `README.md` to the base of your test file to explain the usage of your programs and expose your conclusions.
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
|
||||
# None of these targets produce a file with the same name, so mark them phony
# to keep make from skipping them if such a file or directory ever appears.
.PHONY: install run test

# Generate csv/pairs.csv and csv/corpus.csv.
install:
	./bin/pairs.sh
	./bin/corpus.sh

# Apply the replacements and write csv/replaced.csv.
run:
	./bin/replace.sh

# Check csv/replaced.csv.
test:
	./bin/test.sh
|
|
@ -0,0 +1,43 @@
|
|||
# Bash Fast Replace
|
||||
|
||||
An implementation in bash.
|
||||
|
||||
## Usage
|
||||
|
||||
To create the `corpus.csv` and `pairs.csv` files and perform the replacements:
|
||||
|
||||
```bash
|
||||
make install # create corpus and pairs
|
||||
make run # create file with the replaced values
|
||||
```
|
||||
|
||||
Then to test:
|
||||
|
||||
```bash
|
||||
make test # Checks that the replacement worked
|
||||
```
|
||||
|
||||
|
||||
## Conclusions
|
||||
|
||||
`shuf` is amazing. Using /dev/urandom directly produces broken pipe errors when run from Make.
|
||||
|
||||
Speed is not great... But no surprise there.
|
||||
````bash
|
||||
$ make install
|
||||
./bin/pairs.sh
|
||||
Generating csv/pairs.csv...
|
||||
The generation of 1000 pairs took 2s
|
||||
./bin/corpus.sh
|
||||
Generating csv/corpus.csv...
|
||||
generated in 3s
|
||||
|
||||
$ make run
|
||||
./bin/replace.sh
|
||||
Generating replaced.csv...
|
||||
Replacement took 3s
|
||||
|
||||
$ make test
|
||||
./bin/test.sh
|
||||
test OK
|
||||
````
|
|
@ -0,0 +1,36 @@
|
|||
#! /bin/bash

# Generates csv/corpus.csv: AMOUNT lines of random lowercase letters, each
# with one zero-padded numeric key embedded at a random position. The lines
# are written in shuffled order.

START=$(date +%s)

BASE=$(dirname "$(dirname "$(realpath -s "$0")")")
AMOUNT=1000
FILE_NAME="corpus.csv"
FULL_PATH="$BASE/csv/$FILE_NAME"

# *.csv is git-ignored, so the csv directory may not exist on a fresh checkout.
mkdir -p "$BASE/csv"

echo "Generating csv/$FILE_NAME..."

# Keys are padded to the number of digits in AMOUNT; the rest of each
# 100-character line is random filler split around the key.
KEY_LEN=${#AMOUNT}
LINE_LEN=$((100 - KEY_LEN))

# The whole loop is piped through shuf once, which both randomizes the line
# order and replaces any previous file without needing a temporary copy.
for (( i = 0; i < AMOUNT; i++ )); do
    KEY=$(printf "%0${KEY_LEN}d" "$i")

    LEN1=$(shuf -i "0-$LINE_LEN" -n 1)
    LEN2=$((LINE_LEN - LEN1))

    PART1=$(shuf -er -n "$LEN1" {a..z} | paste -sd "")
    PART2=$(shuf -er -n "$LEN2" {a..z} | paste -sd "")

    echo "$PART1$KEY$PART2"
done | shuf > "$FULL_PATH"

END=$(date +%s)
# ELAPSED rather than SECONDS: SECONDS is a special variable in bash.
ELAPSED=$((END - START))
echo "$FILE_NAME generated in ${ELAPSED}s"
|
|
@ -0,0 +1,28 @@
|
|||
#! /bin/bash

# Generates csv/pairs.csv: AMOUNT lines of the form KEY,VALUE where KEY is a
# zero-padded counter and VALUE is a random string of uppercase letters of the
# same length.

START=$(date +%s)

AMOUNT=1000
BASE=$(dirname "$(dirname "$(realpath -s "$0")")")
FILE_NAME="pairs.csv"
FULL_PATH="$BASE/csv/$FILE_NAME"

# *.csv is git-ignored, so the csv directory may not exist on a fresh checkout.
mkdir -p "$BASE/csv"

echo "Generating csv/$FILE_NAME..."

# Keys are padded to the number of digits in AMOUNT.
KEY_LEN=${#AMOUNT}

# Redirecting the loop as a whole truncates any previous file and avoids
# reopening the output once per line.
for (( i = 0; i < AMOUNT; i++ )); do
    KEY=$(printf "%0${KEY_LEN}d" "$i")
    VAL=$(shuf -er -n "$KEY_LEN" {A..Z} | paste -sd "")

    echo "$KEY,$VAL"
done > "$FULL_PATH"

END=$(date +%s)
# ELAPSED rather than SECONDS: SECONDS is a special variable in bash.
ELAPSED=$((END - START))
echo "The generation of $AMOUNT pairs took ${ELAPSED}s"
|
|
@ -0,0 +1,27 @@
|
|||
#! /bin/bash

# Applies every KEY,VALUE pair from csv/pairs.csv as a global search and
# replace on csv/corpus.csv and writes the result to csv/replaced.csv.

START=$(date +%s)
BASE=$(dirname "$(dirname "$(realpath -s "$0")")")

CORPUS_FILE="$BASE/csv/corpus.csv"
PAIRS_FILE="$BASE/csv/pairs.csv"
REPLACED_FILE="$BASE/csv/replaced.csv"

for INPUT in "$CORPUS_FILE" "$PAIRS_FILE"; do
    if [ ! -f "$INPUT" ]; then
        echo "File not found: $INPUT"
        echo "Generate the corpus and pairs first."
        exit 1
    fi
done

echo "Generating replaced.csv..."

# Turn each KEY,VALUE row into an "s/KEY/VALUE/g" command and hand the whole
# list to a single sed run. The substitutions are still applied in pairs.csv
# order, but the corpus is read and written once instead of once per pair.
# Keys are used as sed patterns verbatim, so they must not contain "/" or
# regular-expression metacharacters.
sed -f <(
    while IFS=, read -r KEY VAL
    do
        printf 's/%s/%s/g\n' "$KEY" "$VAL"
    done < "$PAIRS_FILE"
) "$CORPUS_FILE" > "$REPLACED_FILE"

END=$(date +%s)
# ELAPSED rather than SECONDS: SECONDS is a special variable in bash.
ELAPSED=$((END - START))
echo "Replacement took ${ELAPSED}s"
|
|
@ -0,0 +1,19 @@
|
|||
#! /bin/bash

# Tests a file called replaced.csv: the test passes when the file contains no
# digit at all, i.e. every numeric key has been replaced.
BASE=$(dirname "$(dirname "$(realpath -s "$0")")")
REPLACED_FILE="$BASE/csv/replaced.csv"

if [ ! -f "$REPLACED_FILE" ]; then
    echo "File not found: $REPLACED_FILE"
    echo "Generate replacements first."
    exit 1
fi

# grep streams the file and stops at the first digit, instead of loading the
# whole file into a shell string.
if grep -q '[0-9]' "$REPLACED_FILE"; then
    echo 'test NOK'
    exit 1
else
    echo 'test OK'
    exit 0
fi
|
|
@ -0,0 +1,13 @@
|
|||
|
||||
# None of these targets produce a file with the same name, so mark them phony
# to keep make from skipping them if such a file or directory ever appears.
.PHONY: build run test

# Compile the three commands into ./bin.
build:
	go build -o bin/pairs cmd/pairs/main.go
	go build -o bin/corpus cmd/corpus/main.go
	go build -o bin/replace cmd/replace/main.go

# Generate the corpus and the pairs, then benchmark the replacements.
run:
	./bin/corpus
	./bin/pairs
	./bin/replace

test:
	./bin/test.sh
|
|
@ -0,0 +1,54 @@
|
|||
# Go Fast Replace
|
||||
|
||||
An implementation in Go.
|
||||
|
||||
## To build
|
||||
|
||||
````bash
|
||||
make build
|
||||
````
|
||||
|
||||
## To run
|
||||
|
||||
````bash
|
||||
make run
|
||||
````
|
||||
|
||||
## To test
|
||||
|
||||
````bash
|
||||
make test
|
||||
````
|
||||
|
||||
## Conclusions
|
||||
|
||||
### Replace
|
||||
|
||||
Current implementations:
|
||||
|
||||
- `bm` uses `strings.NewReplacer` which uses the [Boyer–Moore](https://go.dev/src/strings/search.go) algorithm.
|
||||
- `cw` uses [Commentz–Walter](https://en.wikipedia.org/wiki/Commentz-Walter_algorithm) algorithm.
|
||||
|
||||
#### Results
|
||||
|
||||
- `bm` varies a lot, but is never over _460µs_.
|
||||
- `cw` is a lot slower, at around _3.5ms_, but it can search for multiple patterns at once.
|
||||
|
||||
````shell
|
||||
$ make run
|
||||
./bin/corpus
|
||||
Creating corpus...
|
||||
./bin/pairs
|
||||
Creating pairs...
|
||||
./bin/replace
|
||||
|
||||
bm: 273.621µs
|
||||
|
||||
cw: 2.939254ms
|
||||
````
|
||||
|
||||
#### Todo:
|
||||
- Implement parallelism on the Commentz–Walter implementation.
|
||||
- [Boyer–Moore variants](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#Variants)
|
||||
- [Rabin–Karp](https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm) algorithm.
|
||||
- [Aho–Corasick](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm) algorithm.
|
|
@ -0,0 +1,25 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"internal/corpus"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Println("Creating corpus...")
|
||||
f, err := os.OpenFile("./csv/corpus.csv", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
defer f.Close()
|
||||
|
||||
lines := corpus.MakeCorpus()
|
||||
|
||||
// Write to file
|
||||
for _, line := range lines {
|
||||
f.WriteString(line + "\n")
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"internal/pairs"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Println("Creating pairs...")
|
||||
f, err := os.OpenFile("./csv/pairs.csv", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
defer f.Close()
|
||||
|
||||
lines := pairs.MakePairs()
|
||||
|
||||
// Write to file
|
||||
for _, line := range lines {
|
||||
f.WriteString(line + "\n")
|
||||
}
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"time"
|
||||
"internal/replace"
|
||||
)
|
||||
|
||||
func main() {
|
||||
run, err := getArgs()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
return
|
||||
}
|
||||
|
||||
corpusBytes, err := os.ReadFile("./csv/corpus.csv")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
return
|
||||
}
|
||||
|
||||
// Read the CSV file containing replacement key-value pairs
|
||||
pairsHandle, err := os.Open("./csv/pairs.csv")
|
||||
if err != nil {
|
||||
fmt.Println("Error opening pairs:", err)
|
||||
return
|
||||
}
|
||||
defer pairsHandle.Close()
|
||||
|
||||
replacements, err := csv.NewReader(pairsHandle).ReadAll()
|
||||
if err != nil {
|
||||
fmt.Println("Error reading pairs:", err)
|
||||
return
|
||||
}
|
||||
|
||||
corpus := string(corpusBytes)
|
||||
|
||||
var pairs []string
|
||||
var replaced string
|
||||
var startTime time.Time
|
||||
var elapsedTime time.Duration
|
||||
for _, algo := range run {
|
||||
switch algo {
|
||||
case "cw":
|
||||
// Prepare
|
||||
cw := replace.NewCommentzWalter()
|
||||
for _, record := range replacements {
|
||||
cw.AddPattern(record[0], record[1])
|
||||
}
|
||||
cw.BuildMatchingMachine()
|
||||
|
||||
// Replace
|
||||
startTime = time.Now()
|
||||
replaced = cw.CommentzWalterReplace(corpus)
|
||||
elapsedTime = time.Since(startTime)
|
||||
case "bm":
|
||||
// Prepare...
|
||||
for _, record := range replacements {
|
||||
pairs = append(pairs, record[0], record[1])
|
||||
}
|
||||
|
||||
// Replace
|
||||
startTime = time.Now()
|
||||
replaced = replace.BoyerMooreReplace(corpus, pairs)
|
||||
elapsedTime = time.Since(startTime)
|
||||
}
|
||||
fmt.Printf("\n%s: %s\n", algo, elapsedTime)
|
||||
}
|
||||
|
||||
// Write replaced text to a file
|
||||
outputHandle, err := os.OpenFile("./csv/replaced.csv", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755)
|
||||
if err != nil {
|
||||
fmt.Println("Error creating output file:", err)
|
||||
return
|
||||
}
|
||||
defer outputHandle.Close()
|
||||
|
||||
// Write replaced text to the file
|
||||
_, err = outputHandle.WriteString(replaced)
|
||||
if err != nil {
|
||||
fmt.Println("Error writing to file:", err)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// getArgs returns the list of algorithms to run, chosen by the first
// command-line argument:
//
//	(none), "all"  both "bm" and "cw", in that order
//	"bm"           only "bm"
//	"cw"           only "cw"
//
// Any other value yields a nil slice and an error naming the bad argument.
// Arguments after the first are ignored.
func getArgs() ([]string, error) {
	if len(os.Args) < 2 {
		return []string{"bm", "cw"}, nil
	}

	switch arg := os.Args[1]; arg {
	case "all":
		return []string{"bm", "cw"}, nil
	case "cw":
		return []string{"cw"}, nil
	case "bm":
		return []string{"bm"}, nil
	default:
		return nil, errors.New("invalid argument \"" + arg + "\": expected all, bm or cw")
	}
}
|
|
@ -0,0 +1,10 @@
|
|||
module fast-replace
|
||||
|
||||
go 1.21.5
|
||||
|
||||
require internal/corpus v1.0.0
|
||||
replace internal/corpus => ./internal/corpus
|
||||
require internal/pairs v1.0.0
|
||||
replace internal/pairs => ./internal/pairs
|
||||
require internal/replace v1.0.0
|
||||
replace internal/replace => ./internal/replace
|
|
@ -0,0 +1,45 @@
|
|||
package corpus
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math/rand"
|
||||
)
|
||||
|
||||
func MakeCorpus() []string {
|
||||
// Generate data
|
||||
lineLength := 100
|
||||
var lines []string
|
||||
for i := 1; i <= 1000; i++ {
|
||||
lines = append(lines, makeLine(i, lineLength))
|
||||
}
|
||||
|
||||
// Randomize
|
||||
for i := range lines {
|
||||
j := rand.Intn(i + 1)
|
||||
lines[i], lines[j] = lines[j], lines[i]
|
||||
}
|
||||
|
||||
return lines
|
||||
}
|
||||
|
||||
// makeLine returns a string of lineLength bytes made of random lowercase
// letters with the six-digit, zero-padded decimal form of line embedded at a
// random position, which may be the very start or the very end.
//
// When lineLength is not larger than the key, the key alone is returned.
func makeLine(line int, lineLength int) string {
	key := fmt.Sprintf("%06d", line)

	// Number of filler letters to split around the key; never negative so
	// that short line lengths do not make rand.Intn panic.
	filler := lineLength - len(key)
	if filler < 0 {
		filler = 0
	}

	// Intn's upper bound is exclusive, hence the +1 to allow an empty suffix.
	firstLen := rand.Intn(filler + 1)
	secondLen := filler - firstLen

	return randomString(firstLen) + key + randomString(secondLen)
}

// randomString returns n random letters drawn from 'a'..'z'.
func randomString(n int) string {
	const letters = "abcdefghijklmnopqrstuvwxyz"

	// The alphabet is ASCII, so bytes are enough.
	s := make([]byte, n)
	for i := range s {
		s[i] = letters[rand.Intn(len(letters))]
	}

	return string(s)
}
|
|
@ -0,0 +1,3 @@
|
|||
module corpus
|
||||
|
||||
go 1.21.5
|
|
@ -0,0 +1,3 @@
|
|||
module pairs
|
||||
|
||||
go 1.21.5
|
|
@ -0,0 +1,31 @@
|
|||
package pairs
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math/rand"
|
||||
)
|
||||
|
||||
func MakePairs() []string {
|
||||
var pairs []string
|
||||
for i := 1; i <= 1000; i++ {
|
||||
pairs = append(pairs, makeKey(i) + "," + makeValue())
|
||||
}
|
||||
|
||||
return pairs
|
||||
}
|
||||
|
||||
// makeKey returns i in decimal, left-padded with zeros to at least six
// digits.
func makeKey(i int) string {
	return fmt.Sprintf("%06d", i)
}
|
||||
|
||||
// makeValue returns six random letters drawn from 'A'..'Z'.
func makeValue() string {
	const (
		letters     = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		valueLength = 6
	)

	// The alphabet is ASCII, so bytes are enough.
	s := make([]byte, valueLength)
	for i := range s {
		s[i] = letters[rand.Intn(len(letters))]
	}

	return string(s)
}
|
|
@ -0,0 +1,11 @@
|
|||
package replace
|
||||
|
||||
import "strings"
|
||||
|
||||
// BoyerMooreReplace returns a copy of corpus with the replacements described
// by pairs applied. pairs is a flat list of old, new, old, new, ... strings
// and is passed unchanged to strings.NewReplacer, which panics when given an
// odd number of arguments.
//
// The replacer is built on every call, so its construction is part of the
// cost of this function.
//
// NOTE(review): the name suggests Boyer-Moore, and the standard library does
// have a Boyer-Moore based finder (https://go.dev/src/strings/search.go), but
// as far as the strings package source shows, NewReplacer only selects it for
// a single old/new pair; with many pairs a different internal algorithm is
// chosen. Confirm against https://go.dev/src/strings/replace.go before
// drawing conclusions from the benchmark name.
func BoyerMooreReplace(corpus string, pairs []string) string {
	replacer := strings.NewReplacer(pairs...)

	return replacer.Replace(corpus)
}
|
|
@ -0,0 +1,18 @@
|
|||
package replace
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"internal/replace"
|
||||
)
|
||||
|
||||
func TestBoyerMooreReplace(t *testing.T) {
|
||||
var corpus string = "abcdefghi012345jklmnopqrst"
|
||||
var pairs = []string{"012345", "ABCDEF"}
|
||||
|
||||
var want string = "abcdefghiABCDEFjklmnopqrst"
|
||||
got := replace.BoyerMooreReplace(corpus, pairs)
|
||||
|
||||
if want != got {
|
||||
t.Error("Replacement was wrong.")
|
||||
}
|
||||
}
|
|
@ -0,0 +1,113 @@
|
|||
package replace
|
||||
|
||||
// Node is one state of the pattern trie used by CommentzWalter.
type Node struct {
	children map[rune]*Node // next state for each rune that extends this prefix
	output   int            // index of the pattern that ends at this state, or -1
	fail     *Node          // state to fall back to on a mismatch; nil on the root
	depth    int            // number of runes on the path from the root to this state
}

// NewNode returns an empty state: no children, no pattern ending at it and no
// failure link.
func NewNode() *Node {
	return &Node{
		children: make(map[rune]*Node),
		output:   -1,
		fail:     nil,
	}
}

// CommentzWalter is a multi-pattern search-and-replace machine.
//
// Use it in three steps: register every pattern with AddPattern, call
// BuildMatchingMachine once, then call CommentzWalterReplace as often as
// needed.
//
// NOTE(review): despite the name, the machine built here is a forward trie
// with failure links that is scanned left to right, which resembles
// Aho-Corasick rather than the Commentz-Walter algorithm.
type CommentzWalter struct {
	root         *Node
	patterns     []string
	replacements map[int]string // replacement text keyed by index into patterns
}

// NewCommentzWalter returns a machine with no patterns.
func NewCommentzWalter() *CommentzWalter {
	return &CommentzWalter{
		root:         NewNode(),
		patterns:     make([]string, 0),
		replacements: make(map[int]string),
	}
}

// AddPattern registers pattern together with the text that replaces it. The
// pattern only takes effect after the next call to BuildMatchingMachine.
func (cw *CommentzWalter) AddPattern(pattern string, replacement string) {
	cw.patterns = append(cw.patterns, pattern)
	idx := len(cw.patterns) - 1
	cw.replacements[idx] = replacement
}

// BuildMatchingMachine inserts every registered pattern into the trie and
// computes the failure link of every state.
//
// Empty patterns are skipped. When the same pattern was added more than once,
// the replacement of the last one added is used.
func (cw *CommentzWalter) BuildMatchingMachine() {
	// Phase 1: build the trie, one path per pattern.
	for idx, pattern := range cw.patterns {
		if pattern == "" {
			continue
		}

		current := cw.root
		for _, char := range pattern {
			next, exists := current.children[char]
			if !exists {
				next = NewNode()
				next.depth = current.depth + 1
				current.children[char] = next
			}
			current = next
		}
		current.output = idx
	}

	// Phase 2: breadth-first walk so that the failure link of a parent is
	// always known before its children are processed.
	queue := make([]*Node, 0, len(cw.root.children))
	for _, node := range cw.root.children {
		node.fail = cw.root
		queue = append(queue, node)
	}

	for len(queue) > 0 {
		r := queue[0]
		queue = queue[1:]

		for char, child := range r.children {
			queue = append(queue, child)

			// Follow the parent's failure chain to the nearest state that
			// can also be extended by char.
			failNode := r.fail
			for failNode != nil && failNode.children[char] == nil {
				failNode = failNode.fail
			}

			if failNode == nil {
				child.fail = cw.root
			} else {
				child.fail = failNode.children[char]
			}
		}
	}
}

// CommentzWalterReplace returns a copy of text in which every occurrence of a
// registered pattern is replaced by its replacement.
//
// The text is scanned once from left to right. As soon as some pattern ends
// at the current rune it is replaced (the longest such pattern when several
// end there) and matching restarts after it, so matches never overlap and
// replacement text is never searched again. Text that does not take part in a
// match is copied unchanged. BuildMatchingMachine must have been called
// first; otherwise text is returned as is.
func (cw *CommentzWalter) CommentzWalterReplace(text string) string {
	replaced := make([]rune, 0, len(text))
	current := cw.root

	for _, char := range text {
		// Every rune is written out first; a completed match is cut back out
		// of the tail below. This keeps partially matched text that later
		// turns out not to be a pattern.
		replaced = append(replaced, char)

		for current != nil && current.children[char] == nil {
			current = current.fail
		}

		if current == nil {
			current = cw.root
			continue
		}

		current = current.children[char]

		// The state itself and the states on its failure chain are all the
		// registered prefixes ending at this rune, longest first.
		for match := current; match != nil && match != cw.root; match = match.fail {
			if match.output == -1 {
				continue
			}

			// match.depth runes of the pattern sit at the tail of replaced;
			// they were all appended after the last restart, so this never
			// cuts into an earlier replacement.
			replaced = replaced[:len(replaced)-match.depth]
			for _, rn := range cw.replacements[match.output] {
				replaced = append(replaced, rn)
			}

			current = cw.root
			break
		}
	}

	return string(replaced)
}
|
|
@ -0,0 +1,3 @@
|
|||
module replace
|
||||
|
||||
go 1.21.5
|
Loading…
Reference in New Issue