# word_segment.py — jieba-based Chinese word segmentation with stop-word filtering.
import re
import jieba
from config import stop_words_path
  4. class Word_Segment():
  5. def __init__(self, n_grams_flag=False):
  6. # 读取停用词表
  7. with open(stop_words_path, 'r', encoding='utf8', errors='ignore') as f:
  8. self.stop_words = set(f.read().split('\n'))
  9. self.n_grams_flag = n_grams_flag
  10. # 加载jieba模型
  11. load_jieba = jieba.lcut('load_jieba')
  12. def __call__(self, sentence):
  13. sentence = re.sub( r"[\-_—]", ' ', sentence)
  14. # 统一将大写转化为小写
  15. seg_list = jieba.lcut(sentence.lower())
  16. # 将分词列表中的数字变为空字符串
  17. seg_list = [re.sub(r'\d+\.?\d*%?', ' ', w) for w in seg_list]
  18. # 若词为停用词,则变为' '(方便后续进行n-grams组合)
  19. seg_list = [w if w not in self.stop_words else ' ' for w in seg_list]
  20. if self.n_grams_flag is True:
  21. seg_list = self.n_grams(seg_list)
  22. return [w.strip() for w in seg_list if w.strip() != '']
  23. # 计算分词后的词语n-grams组合
  24. def n_grams(self, seg_list):
  25. length = len(seg_list)
  26. for i in range(length):
  27. if i+1 < length and self.is_Chinese(seg_list[i]) and self.is_Chinese(seg_list[i+1]):
  28. seg_list.append(seg_list[i]+seg_list[i+1])
  29. return seg_list
  30. # 判断字符串是否为全为中文字符
  31. def is_Chinese(self, string):
  32. for char in string:
  33. if '\u4e00' <= char <= '\u9fff':
  34. continue
  35. else:
  36. return False
  37. return True if string != '' else False
  38. if __name__ == "__main__":
  39. sentence = "中国选手张湘祥在奥运会上获得男子举重62*kg级冠军,挺举成绩是176*kg,图为他比赛时的照片"
  40. sentence = "f = m * a"
  41. word_seg = Word_Segment()
  42. seg_list = word_seg(sentence)
  43. print(seg_list)