1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
- import re
- import jieba
- from config import stop_words_path
class Word_Segment():
    """Callable jieba-based tokenizer that strips numbers and stop words.

    Calling an instance with a sentence returns the list of cleaned tokens.
    When ``n_grams_flag`` is enabled, bigrams built from adjacent
    all-Chinese tokens are appended after the single tokens.
    """

    # Compiled once at class creation instead of on every call / every token.
    _SEPARATOR_RE = re.compile(r"[\-_—]")
    _NUMBER_RE = re.compile(r'\d+\.?\d*%?')

    def __init__(self, n_grams_flag=False):
        """Load the stop-word table and initialise jieba.

        Args:
            n_grams_flag: When truthy, ``__call__`` also appends bigrams of
                adjacent all-Chinese tokens to its result.
        """
        # Read the stop-word table: one entry per line of ``stop_words_path``.
        with open(stop_words_path, 'r', encoding='utf8', errors='ignore') as f:
            self.stop_words = set(f.read().split('\n'))
        self.n_grams_flag = n_grams_flag
        # Load jieba's dictionary eagerly rather than on the first
        # segmentation call.
        jieba.initialize()

    def __call__(self, sentence):
        """Segment ``sentence`` and return its cleaned tokens.

        Args:
            sentence: Text to segment.

        Returns:
            A list of stripped, non-empty tokens with numbers and stop words
            removed. If ``n_grams_flag`` is set, bigrams follow the single
            tokens.
        """
        # Treat hyphens, underscores and em dashes as separators.
        sentence = self._SEPARATOR_RE.sub(' ', sentence)
        # Lower-case everything before segmenting.
        seg_list = jieba.lcut(sentence.lower())
        # Replace numbers (optionally decimal and/or percent) inside each
        # token with a single space; blank tokens are dropped at the end.
        seg_list = [self._NUMBER_RE.sub(' ', w) for w in seg_list]
        # Stop words become ' ' instead of being removed here, so n_grams
        # never joins two tokens that were separated by a stop word.
        seg_list = [' ' if w in self.stop_words else w for w in seg_list]
        if self.n_grams_flag:
            seg_list = self.n_grams(seg_list)

        stripped = (w.strip() for w in seg_list)
        return [w for w in stripped if w]

    def n_grams(self, seg_list):
        """Return the tokens followed by bigrams of adjacent Chinese tokens.

        Args:
            seg_list: Sequence of tokens.

        Returns:
            A new list containing every element of ``seg_list`` in order,
            followed by ``left + right`` for each adjacent pair in which both
            tokens consist only of Chinese characters. ``seg_list`` itself is
            not modified.
        """
        bigrams = [
            left + right
            for left, right in zip(seg_list, seg_list[1:])
            if self.is_Chinese(left) and self.is_Chinese(right)
        ]
        return list(seg_list) + bigrams

    def is_Chinese(self, string):
        """Return True if ``string`` is non-empty and every character lies
        in the range U+4E00..U+9FFF; otherwise return False."""
        return bool(string) and all(
            '\u4e00' <= char <= '\u9fff' for char in string
        )
if __name__ == "__main__":
    # Sample inputs for a quick manual check; only the last one is segmented.
    samples = [
        "中国选手张湘祥在奥运会上获得男子举重62*kg级冠军,挺举成绩是176*kg,图为他比赛时的照片",
        "f = m * a",
    ]
    # Default construction: bigram generation is disabled.
    segmenter = Word_Segment()
    print(segmenter(samples[-1]))
|