247 lines
5.9 KiB
Go
247 lines
5.9 KiB
Go
package misspell
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"io"
|
|
"regexp"
|
|
"strings"
|
|
"text/scanner"
|
|
)
|
|
|
|
func max(x, y int) int {
|
|
if x > y {
|
|
return x
|
|
}
|
|
return y
|
|
}
|
|
|
|
func inArray(haystack []string, needle string) bool {
|
|
for _, word := range haystack {
|
|
if needle == word {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
var wordRegexp = regexp.MustCompile(`[a-zA-Z0-9']+`)
|
|
|
|
// Diff is datastructure showing what changed in a single line
|
|
type Diff struct {
|
|
Filename string
|
|
FullLine string
|
|
Line int
|
|
Column int
|
|
Original string
|
|
Corrected string
|
|
}
|
|
|
|
// Replacer is the main struct for spelling correction
|
|
type Replacer struct {
|
|
Replacements []string
|
|
Debug bool
|
|
engine *StringReplacer
|
|
corrected map[string]string
|
|
}
|
|
|
|
// New creates a new default Replacer using the main rule list
|
|
func New() *Replacer {
|
|
r := Replacer{
|
|
Replacements: DictMain,
|
|
}
|
|
r.Compile()
|
|
return &r
|
|
}
|
|
|
|
// RemoveRule deletes existings rules.
|
|
// TODO: make inplace to save memory
|
|
func (r *Replacer) RemoveRule(ignore []string) {
|
|
newwords := make([]string, 0, len(r.Replacements))
|
|
for i := 0; i < len(r.Replacements); i += 2 {
|
|
if inArray(ignore, r.Replacements[i]) {
|
|
continue
|
|
}
|
|
newwords = append(newwords, r.Replacements[i:i+2]...)
|
|
}
|
|
r.engine = nil
|
|
r.Replacements = newwords
|
|
}
|
|
|
|
// AddRuleList appends new rules.
|
|
// Input is in the same form as Strings.Replacer: [ old1, new1, old2, new2, ....]
|
|
// Note: does not check for duplictes
|
|
func (r *Replacer) AddRuleList(additions []string) {
|
|
r.engine = nil
|
|
r.Replacements = append(r.Replacements, additions...)
|
|
}
|
|
|
|
// Compile compiles the rules. Required before using the Replace functions
|
|
func (r *Replacer) Compile() {
|
|
|
|
r.corrected = make(map[string]string, len(r.Replacements)/2)
|
|
for i := 0; i < len(r.Replacements); i += 2 {
|
|
r.corrected[r.Replacements[i]] = r.Replacements[i+1]
|
|
}
|
|
r.engine = NewStringReplacer(r.Replacements...)
|
|
}
|
|
|
|
/*
|
|
line1 and line2 are different
|
|
extract words from each line1
|
|
|
|
replace word -> newword
|
|
if word == new-word
|
|
continue
|
|
if new-word in list of replacements
|
|
continue
|
|
new word not original, and not in list of replacements
|
|
some substring got mixed up. UNdo
|
|
*/
|
|
func (r *Replacer) recheckLine(s string, lineNum int, buf io.Writer, next func(Diff)) {
|
|
first := 0
|
|
redacted := RemoveNotWords(s)
|
|
|
|
idx := wordRegexp.FindAllStringIndex(redacted, -1)
|
|
for _, ab := range idx {
|
|
word := s[ab[0]:ab[1]]
|
|
newword := r.engine.Replace(word)
|
|
if newword == word {
|
|
// no replacement done
|
|
continue
|
|
}
|
|
|
|
// ignore camelCase words
|
|
// https://github.com/client9/misspell/issues/113
|
|
if CaseStyle(word) == CaseUnknown {
|
|
continue
|
|
}
|
|
|
|
if StringEqualFold(r.corrected[strings.ToLower(word)], newword) {
|
|
// word got corrected into something we know
|
|
io.WriteString(buf, s[first:ab[0]])
|
|
io.WriteString(buf, newword)
|
|
first = ab[1]
|
|
next(Diff{
|
|
FullLine: s,
|
|
Line: lineNum,
|
|
Original: word,
|
|
Corrected: newword,
|
|
Column: ab[0],
|
|
})
|
|
continue
|
|
}
|
|
// Word got corrected into something unknown. Ignore it
|
|
}
|
|
io.WriteString(buf, s[first:])
|
|
}
|
|
|
|
// ReplaceGo is a specialized routine for correcting Golang source
|
|
// files. Currently only checks comments, not identifiers for
|
|
// spelling.
|
|
func (r *Replacer) ReplaceGo(input string) (string, []Diff) {
|
|
var s scanner.Scanner
|
|
s.Init(strings.NewReader(input))
|
|
s.Mode = scanner.ScanIdents | scanner.ScanFloats | scanner.ScanChars | scanner.ScanStrings | scanner.ScanRawStrings | scanner.ScanComments
|
|
lastPos := 0
|
|
output := ""
|
|
Loop:
|
|
for {
|
|
switch s.Scan() {
|
|
case scanner.Comment:
|
|
origComment := s.TokenText()
|
|
newComment := r.engine.Replace(origComment)
|
|
|
|
if origComment != newComment {
|
|
// s.Pos().Offset is the end of the current token
|
|
// subtract len(origComment) to get the start of the token
|
|
offset := s.Pos().Offset
|
|
output = output + input[lastPos:offset-len(origComment)] + newComment
|
|
lastPos = offset
|
|
}
|
|
case scanner.EOF:
|
|
break Loop
|
|
}
|
|
}
|
|
|
|
if lastPos == 0 {
|
|
// no changes, no copies
|
|
return input, nil
|
|
}
|
|
if lastPos < len(input) {
|
|
output = output + input[lastPos:]
|
|
}
|
|
diffs := make([]Diff, 0, 8)
|
|
buf := bytes.NewBuffer(make([]byte, 0, max(len(input), len(output))+100))
|
|
// faster that making a bytes.Buffer and bufio.ReadString
|
|
outlines := strings.SplitAfter(output, "\n")
|
|
inlines := strings.SplitAfter(input, "\n")
|
|
for i := 0; i < len(inlines); i++ {
|
|
if inlines[i] == outlines[i] {
|
|
buf.WriteString(outlines[i])
|
|
continue
|
|
}
|
|
r.recheckLine(inlines[i], i+1, buf, func(d Diff) {
|
|
diffs = append(diffs, d)
|
|
})
|
|
}
|
|
|
|
return buf.String(), diffs
|
|
|
|
}
|
|
|
|
// Replace is corrects misspellings in input, returning corrected version
|
|
// along with a list of diffs.
|
|
func (r *Replacer) Replace(input string) (string, []Diff) {
|
|
output := r.engine.Replace(input)
|
|
if input == output {
|
|
return input, nil
|
|
}
|
|
diffs := make([]Diff, 0, 8)
|
|
buf := bytes.NewBuffer(make([]byte, 0, max(len(input), len(output))+100))
|
|
// faster that making a bytes.Buffer and bufio.ReadString
|
|
outlines := strings.SplitAfter(output, "\n")
|
|
inlines := strings.SplitAfter(input, "\n")
|
|
for i := 0; i < len(inlines); i++ {
|
|
if inlines[i] == outlines[i] {
|
|
buf.WriteString(outlines[i])
|
|
continue
|
|
}
|
|
r.recheckLine(inlines[i], i+1, buf, func(d Diff) {
|
|
diffs = append(diffs, d)
|
|
})
|
|
}
|
|
|
|
return buf.String(), diffs
|
|
}
|
|
|
|
// ReplaceReader applies spelling corrections to a reader stream. Diffs are
|
|
// emitted through a callback.
|
|
func (r *Replacer) ReplaceReader(raw io.Reader, w io.Writer, next func(Diff)) error {
|
|
var (
|
|
err error
|
|
line string
|
|
lineNum int
|
|
)
|
|
reader := bufio.NewReader(raw)
|
|
for err == nil {
|
|
lineNum++
|
|
line, err = reader.ReadString('\n')
|
|
|
|
// if it's EOF, then line has the last line
|
|
// don't like the check of err here and
|
|
// in for loop
|
|
if err != nil && err != io.EOF {
|
|
return err
|
|
}
|
|
// easily 5x faster than regexp+map
|
|
if line == r.engine.Replace(line) {
|
|
io.WriteString(w, line)
|
|
continue
|
|
}
|
|
// but it can be inaccurate, so we need to double check
|
|
r.recheckLine(line, lineNum, w, next)
|
|
}
|
|
return nil
|
|
}
|