1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- # -*- coding:utf-8 -*-
- # import requests
- # from flask import Flask,request
- # from datetime import timedelta
- #
- # app = Flask(__name__)
- #
- #
- # @app.route('/bridge', methods=['POST', 'GET'])
- # def requests_bridge():
- # if request.method == 'post':
- # req_Info = request.json
- # if req_Info['method'] == 'get':
- # return requests.get(req_Info['url'])
- # elif req_Info['method'] == 'post':
- # if 'json' in req_Info:
- # return requests.post(req_Info['url'],json=req_Info['json'])
- # if 'data' in req_Info:
- # return requests.post(req_Info['url'], data=req_Info['data'])
- #
- #
- # if __name__ == '__main__':
- # app.run('0.0.0.0',8989)
# Sanity check: print the first five lines of the pretrained vector file.
# The path is a raw string because "\B" and "\s" are not valid escape
# sequences; a plain literal only works by accident and triggers a
# warning on newer Python versions.
with open(r'D:\BaiduNetdiskDownload\sgns.literature.bigram-char\sgns.literature.bigram-char', encoding='utf8') as f:
    for line_no, line in enumerate(f, start=1):
        if line_no > 5:
            break
        # Each line keeps its own trailing newline, so print() emits a
        # blank line after every entry (same as the original output).
        print(line)
- import bcolz
def load_embeddings(folder_path):
    """Load word/character vectors stored as bcolz arrays.

    Args:
        - folder_path (str): the unpacked bcolz rootdir (e.g. ``zh.64``).
            It is expected to contain two sub-directories, ``embeddings``
            and ``words``, holding the embedding vectors and the
            vocabulary respectively.
    Returns:
        - words (bcolz.carray): the vocabulary (words or characters).
        - embeddings (bcolz.carray): the embedding matrix, one vector per
            row; per the original notes, the row number is the index of
            the corresponding entry in ``words``.
    """
    # Only trailing forward slashes are removed before the sub-directory
    # names are appended.
    root = folder_path.rstrip('/')

    def open_readonly(subdir):
        # Open one on-disk carray below the root in read-only mode.
        return bcolz.carray(rootdir='%s/%s' % (root, subdir), mode='r')

    # Tuple elements are evaluated left to right: words first, then embeddings.
    return open_readonly('words'), open_readonly('embeddings')
# Location of the unpacked bcolz archive to export.
folder_path = r'C:\Users\mayn\Downloads\zh_char.64'
words, embeddings = load_embeddings(folder_path)

# Write a plain-text dump: a header line "<entry count> 64", then one line
# per entry made of the token followed by its space-separated components.
# NOTE(review): the dimension 64 is hard-coded; presumably it matches the
# width of `embeddings` -- confirm before reusing with other archives.
with open('char64d.txt', 'w', encoding='utf8') as out_file:
    out_file.write('%d 64\n' % len(words))
    for token, vector in zip(words, embeddings):
        components = ' '.join(map(str, vector))
        out_file.write(token + ' ' + components + '\n')
|