import os
import re
import sys
# Extend the module search path before the project import below runs,
# presumably so that the `Utils` package can be resolved from this location.
# NOTE(review): this is a hard-coded absolute path on one machine — confirm it
# is valid (or replace it with a path derived from this file) before running
# the script elsewhere.
sys.path.append("/home/cv/workspace/tujintao/document_segmentation")
from Utils.main_clear.sci_clear import non_data_latex_iter
# NOTE(review): despite its name, this value looks like a relative directory
# path rather than a single file name; how it is used is not visible in this
# part of the file — confirm. `read_data` below reuses the name `filename`
# for its own loop variable.
filename = "Data/samples"
def read_data(directory):
all_documents = []
all_labels = []
for filename in os.listdir(directory)[:2]:
if filename.endswith(".txt"):
filepath = os.path.join(directory, filename)
# print(filepath)
# 读取txt文件内容并处理每一行结尾
with open(filepath, "r", encoding="utf-8") as file:
lines = file.readlines()
# for i in range(len(lines)):
# if not lines[i].endswith("
------------------------1\n"):
# lines[i] = re.sub(r'------------------------1$', '
------------------------1\n', lines[i])
# 将所有行的内容拼接为一行,并清除无关符号
text = "".join(lines)
text = re.sub(r'