ir_db_establish.py 3.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. import os
  2. import time
  3. import shutil
  4. import sqlite3
  5. from config import sqlite_path, sqlite_copy_path, mongo_coll
  6. from word_segment import Word_Segment
  7. # 建立倒排索引
  8. class Inverted_Index_Establish():
  9. def __init__(self, n_grams_flag=False):
  10. # 获取mongodb数据
  11. self.mongo_coll = mongo_coll
  12. # 获取数据库地址
  13. self.sqlite_path = sqlite_path
  14. self.sqlite_copy_path = sqlite_copy_path
  15. # 分词算法
  16. self.word_seg = Word_Segment(n_grams_flag=n_grams_flag)
  17. def __call__(self):
  18. # 从mongodb读取数据集
  19. origin_dataset = self.mongo_coll.find(no_cursor_timeout=True, batch_size=5)
  20. # 将数据构建倒排索引
  21. inverted_index = self.inverted_index_compute(origin_dataset)
  22. # 将倒排索引数据写入sqlite
  23. self.sqlite_write(inverted_index)
  24. # 备份sqlite数据库,防止出现损坏
  25. shutil.copy(self.sqlite_path, self.sqlite_copy_path)
  26. # 读取数据构建倒排索引
  27. def inverted_index_compute(self, origin_dataset):
  28. start = time.time()
  29. # 倒排索引词典
  30. inverted_index = dict()
  31. # 计算文档总长度,用于计算平均长度
  32. all_doc_length = 0
  33. for i,data in enumerate(origin_dataset):
  34. if "content_clear" not in data:
  35. continue
  36. seg_list = self.word_seg(data["content_clear"])
  37. # 计算每篇文档长度和总文档长度
  38. doc_length = len(seg_list)
  39. all_doc_length += doc_length
  40. # 存储每个文档中每个词的出现次数
  41. tf_dict = dict()
  42. for word in seg_list:
  43. tf_dict[word] = tf_dict.get(word, 0) + 1
  44. # 将每个term对应的文档按固定格式存入倒排索引词典
  45. for term,tf in tf_dict.items():
  46. # 文档间通过"\n"分隔,文档内通过"\t"分隔(term_freq, doc_length)
  47. doc_info = "{}\t{}".format(tf, doc_length)
  48. if term not in inverted_index:
  49. inverted_index[term] = [str(data["id"]), doc_info]
  50. else:
  51. inverted_index[term][0] += "\n" + str(data["id"])
  52. inverted_index[term][1] += "\n" + doc_info
  53. # 获取文档总数与文档总长度存入倒排索引词典
  54. inverted_index["doc_data_statistics"] = [i + 1, str(all_doc_length)]
  55. print(time.time()-start)
  56. print("倒排索引计算完毕")
  57. return inverted_index
  58. # 将倒排索引数据写入sqlite
  59. def sqlite_write(self, inverted_index):
  60. # 若已存在sqlite数据库,则删除数据库重建
  61. if os.path.exists(self.sqlite_path):
  62. os.remove(self.sqlite_path)
  63. # 建立sqlite数据库链接
  64. sqlite_conn = sqlite3.connect(self.sqlite_path)
  65. # 创建游标对象cursor
  66. cursor = sqlite_conn.cursor()
  67. # 创建数据表
  68. cursor.execute("DROP TABLE IF EXISTS physics")
  69. cursor.execute("CREATE TABLE physics (term TEXT PRIMARY KEY, doc_freq TEXT, docs TEXT)")
  70. for key, value in inverted_index.items():
  71. t = (key, value[0], value[1])
  72. cursor.execute("INSERT INTO physics VALUES (?, ?, ?)", t)
  73. # 提交事务
  74. sqlite_conn.commit()
  75. # 关闭数据库连接
  76. cursor.close()
  77. sqlite_conn.close()
  78. if __name__ == "__main__":
  79. # 初始化倒排索引建立(n-grams为True)
  80. sql_etl = Inverted_Index_Establish(True)
  81. sql_etl()