文章预览
原理留到下一篇再讲,这里先贴出代码:
https://github.com/gomate-community/GoMate/blob/main/gomate/modules/retrieval/bm25_retriever.py

```python
import logging
import math
from multiprocessing import Pool, cpu_count
from typing import List,Dict

import jieba
import numpy as np
import tiktoken

from gomate.modules.retrieval.retrievers import BaseRetriever

jieba.setLogLevel(logging.INFO)


def tokenizer(text: str):
    return [word for word in jieba.cut(text)]


class BM25:
    def __init__(self, corpus, tokenizer=None):
        self.corpus_size = 0
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        self.tokenizer = tokenizer

        if tokenizer:
            corpus = self._tokenize_corpus(corpus)

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        nd = {}
```
………………………………