import re

import jieba

from config import stop_words_path


class Word_Segment():
    def __init__(self, n_grams_flag=False):
        # Load the stop-word list
        with open(stop_words_path, 'r', encoding='utf8', errors='ignore') as f:
            self.stop_words = set(f.read().split('\n'))
        self.n_grams_flag = n_grams_flag
        # Warm up the jieba model so the first real call is not slowed by lazy loading
        jieba.lcut('load_jieba')

    def __call__(self, sentence):
        # Normalize hyphens, underscores and dashes to spaces
        sentence = re.sub(r"[\-_—]", ' ', sentence)
        # Lowercase the sentence, then segment it with jieba
        seg_init_list = jieba.lcut(sentence.lower())
        # Replace numbers (including decimals and percentages) with a space
        seg_list = [re.sub(r'\d+\.?\d*%?', ' ', w) for w in seg_init_list]
        # Replace stop words with ' ' (keeps positions intact for the later n-gram combination)
        seg_list = [w if w not in self.stop_words else ' ' for w in seg_list]
        if self.n_grams_flag is True:
            seg_list = self.n_grams(seg_list)
        # Drop empty tokens and strip surrounding whitespace
        seg_list = [w.strip() for w in seg_list if w.strip() != '']
        return seg_list, seg_init_list

    # Append n-gram (bigram) combinations of adjacent all-Chinese tokens
    def n_grams(self, seg_list):
        length = len(seg_list)
        for i in range(length):
            if i + 1 < length and self.is_Chinese(seg_list[i]) and self.is_Chinese(seg_list[i + 1]):
                seg_list.append(seg_list[i] + seg_list[i + 1])
        return seg_list

    # Return True only if the string is non-empty and consists entirely of Chinese characters
    def is_Chinese(self, string):
        for char in string:
            if not ('\u4e00' <= char <= '\u9fff'):
                return False
        return string != ''


if __name__ == "__main__":
    sentence = "中国选手张湘祥在奥运会上获得男子举重62*kg级冠军,挺举成绩是176*kg,图为他比赛时的照片"
    sentence = "f = m * a"
    word_seg = Word_Segment()
    seg_list = word_seg(sentence)
    print(seg_list)
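    # A minimal sketch of the n-grams path, assuming the same config/stop-word
    # setup as above: with n_grams_flag=True, adjacent all-Chinese tokens are
    # additionally emitted as concatenated bigrams. The exact tokens depend on
    # the installed jieba dictionary and the contents of the stop-word list.
    ngram_seg = Word_Segment(n_grams_flag=True)
    ngram_list, ngram_init_list = ngram_seg("中国选手张湘祥在奥运会上获得冠军")
    print(ngram_list)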