基于Go的AC自动机

2023-11-10 10:00

|

1,128

|

0

|

358 字

|

13 分钟

基于Go的AC自动机

用于在文本中查找和提取关键词。它使用高效的算法和数据结构，可以快速匹配大量的关键词，并返回匹配结果。

git地址：https://git.echol.cn/loser/keyword-extraction

可以用于智能客服，例如根据关键词回复指定内容

keywordprocessor.go

package extractor

import (
    "unicode"
    "unicode/utf8"
)

type WalkFn func(start, end int) bool

type KeywordProcessor struct {
    root              *Node
    whiteSpaceChars   []string // 不知道有啥用，反正python也定义了，就跟着定义一个吧
    nonWordBoundaries string
    caseSensitive     bool // 匹配是否区分大小写
}

func NewKeywordProcessor(caseSensitive bool) *KeywordProcessor {
    return &KeywordProcessor{
        root:            newNode(),
        caseSensitive:   caseSensitive,
        whiteSpaceChars: []string{".", "\t", "\n", "\a", " ", ","}, // python 就是这样定义的，而且它也没有用到，所以先留着这里吧
    }
}

func (kp *KeywordProcessor) setItem(keyword string) {
    if len(keyword) == 0 {
        return
    }

    node := kp.root
    for _, char := range keyword {
        if !kp.caseSensitive {
            char = unicode.ToLower(char)
        }
        if _, ok := node.children[char]; !ok {
            // 子节点已存在 将子节点设为当前节点
            node.children[char] = newNode()
        }
        node = node.children[char]
    }
    // 记录当前匹配词的长度
    node.exist[len(keyword)] = struct{}{}
}

func (kp *KeywordProcessor) Build() {
    // 创建一个空队列，用于层级遍历 Trie 树节点
    queue := make([]*Node, 0)
    // 将根节点作为初始节点
    queue = append(queue, kp.root)

    for len(queue) > 0 {
        // 获取队列中的第一个节点
        currentNode := queue[0]
        // 弹出队列中的第一个节点
        queue = queue[1:]

        // 遍历当前节点的子节点
        for char, childNode := range currentNode.children {
            // 将子节点添加到队列中
            queue = append(queue, childNode)
            // 当前节点父节点的失败指针
            faFail := currentNode.failure

            // 递归获取父失败指针 直到获取到根节点为止
            for faFail != nil && faFail.children[char] == nil {
                faFail = faFail.failure
            }
            childNode.failure = kp.root
            if faFail != nil {
                childNode.failure = faFail.children[char]
            }
            for key := range childNode.failure.exist {
                childNode.exist[key] = struct{}{}
            }
        }
    }
}
func (kp *KeywordProcessor) AddKeyWord(keyword string) *KeywordProcessor {
    kp.setItem(keyword)
    return kp
}

func (kp *KeywordProcessor) AddKeywordsFromList(keywords []string) *KeywordProcessor {
    for _, keyword := range keywords {
        kp.setItem(keyword)
    }
    return kp
}

func (kp *KeywordProcessor) walk(sentence string, wf WalkFn) {
    // 从根节点开始查找
    currentNode := kp.root
    // 遍历文本 sentence 的每个字符，并记录当前字符的索引为 idx，当前字符为 r
    idx := 0
    for len(sentence) > 0 {
        r, l := utf8.DecodeRuneInString(sentence)
        idx += l
        sentence = sentence[l:]
        if !kp.caseSensitive {
            r = unicode.ToLower(r)
        }
        // 在循环中 判断是否有当前字符子节点
        //如果不存在，则说明匹配失败，需要通过失败路径回溯到前一个节点，直到找到一个匹配的子节点或回溯到根节点。
        for currentNode.children[r] == nil && currentNode.failure != nil {
            currentNode = currentNode.failure
        }
        if currentNode.children[r] == nil {
            continue
        }
        currentNode = currentNode.children[r]
        for length := range currentNode.exist {
            if !wf(idx-length, idx) {
                return
            }
        }
    }
}

func (kp *KeywordProcessor) walkByte(sentence []byte, wf WalkFn) {
    // 从根节点开始查找
    currentNode := kp.root
    // 遍历文本 sentence 的每个字符，并记录当前字符的索引为 idx，当前字符为 r
    idx := 0
    for len(sentence) > 0 {
        r, l := utf8.DecodeRune(sentence)
        idx += l
        sentence = sentence[l:]
        if !kp.caseSensitive {
            r = unicode.ToLower(r)
        }
        // 在循环中 判断是否有当前字符子节点
        //如果不存在，则说明匹配失败，需要通过失败路径回溯到前一个节点，直到找到一个匹配的子节点或回溯到根节点。
        for currentNode.children[r] == nil && currentNode.failure != nil {
            currentNode = currentNode.failure
        }
        if currentNode.children[r] == nil {
            continue
        }
        currentNode = currentNode.children[r]
        for length := range currentNode.exist {
            if !wf(idx-length, idx) {
                return
            }
        }
    }
}

// ExtractKeywords 匹配关键词
func (kp *KeywordProcessor) ExtractKeywords(sentence string) []Match {
    var matches []Match
    if len(sentence) == 0 {
        return matches
    }

    kp.walk(sentence, func(start, end int) bool {
        matches = append(matches, Match{
            start: start,
            end:   end,
            match: sentence[start:end],
        })
        return true
    })
    return matches
}

func (kp *KeywordProcessor) ExtractKeywordsFromBytes(sentence []byte) []Match {
    var matches []Match
    if len(sentence) == 0 {
        return matches
    }

    kp.walkByte(sentence, func(start, end int) bool {
        matches = append(matches, Match{
            start: start,
            end:   end,
            match: string(sentence[start:end]),
        })
        return true
    })
    return matches
}

trie.go

package extractor

type Node struct {
    children map[rune]*Node   // 使用 map 存储叶子节点,key:'char' ,value: *Node
    exist    map[int]struct{} // 关键词节点是一个完整的匹配词，记录其长度
    failure  *Node            // 记录失败指针
}

func newNode() *Node {
    return &Node{
        children: make(map[rune]*Node),
        exist:    make(map[int]struct{}),
    }
}

type Match struct {
    match string
    start int
    end   int
}

func (m *Match) MatchString() string {
    return m.match
}

func (m *Match) Start() int {
    return m.start
}

func (m *Match) End() int {
    return m.end
}

使用示例

import (
    "git.echol.cn/loser/keyword-extraction"
    "fmt"
)

func main() {
    // 创建一个关键词处理器，不区分大小写
    kp := extractor.NewKeywordProcessor(false)

    // 添加关键词
    kp.AddKeyword("apple")
    kp.AddKeyword("banana")
    kp.AddKeyword("orange")

    // 提取关键词
    sentence := "I have an Apple and a Banana."
    matches := kp.ExtractKeywords(sentence)

    // 打印匹配结果
    for _, match := range matches {
        fmt.Println("Match:", match.Match)
        fmt.Println("Start:", match.Start)
        fmt.Println("End:", match.End)
        fmt.Println("---")
    }
}

输出：

Match: Apple
Start: 9
End: 13
---
Match: Banana
Start: 20
End: 25
---

ac自动机 go 关键词提取自动机

暂无评论

发送评论编辑评论

Markdown

悄悄话

邮件提醒

|´・ω・)ノ

ヾ(≧∇≦*)ゝ

(☆ω☆)

（╯‵□′）╯︵┴─┴

￣﹃￣

(/ω＼)

∠( ᐛ 」∠)＿

(๑•̀ㅁ•́ฅ)

→_→

୧(๑•̀⌄•́๑)૭

٩(ˊᗜˋ*)و

(ノ°ο°)ノ

(´இ皿இ｀)

⌇●﹏●⌇

(ฅ´ω`ฅ)

(╯°A°)╯︵○○○

φ(￣∇￣o)

ヾ(´･･｀｡)ノ"

( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃

(ó﹏ò｡)

Σ(っ °Д °;)っ

( ,,´･ω･)ﾉ"(´っω･｀｡)

╮(╯▽╰)╭

o(*////▽////*)q

＞﹏＜

( ๑´•ω•) "(ㆆᴗㆆ)

颜文字

Emoji

小恐龙

花!