from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

from my_config import LANG_EMB_MODEL
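
# NOTE: my_config is not shown in this listing. A minimal sketch of what it
# might contain (the checkpoint name is an assumption; any HuggingFace
# Chinese sentence-embedding model would work):
#
#   LANG_EMB_MODEL = {"cn": "shibing624/text2vec-base-chinese"}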

tokenizer = AutoTokenizer.from_pretrained(LANG_EMB_MODEL["cn"])
model = AutoModel.from_pretrained(LANG_EMB_MODEL["cn"])  # add from_tf=True if the checkpoint only ships TensorFlow weights

# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output holds the token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
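
# Illustration: token_embeddings has shape (batch, seq_len, hidden) and
# attention_mask has shape (batch, seq_len); padded positions are zeroed out
# before averaging, so a sequence with mask [1, 1, 0] averages only its first
# two token vectors. The clamp guards against dividing by zero on an
# all-padding row.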

def item2emb_cn(sentences):
    """Embed a list of Chinese sentences into L2-normalized sentence vectors."""
    # Tokenize the sentences (padded/truncated to a common length)
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool token embeddings into one vector per sentence
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings so dot products equal cosine similarities
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings
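
# For reference: mean pooling followed by L2 normalization mirrors what
# sentence-transformers does internally for such checkpoints. A rough
# equivalent (a sketch, assuming the checkpoint loads as a
# SentenceTransformer model) would be:
#
#   from sentence_transformers import SentenceTransformer
#   st_model = SentenceTransformer(LANG_EMB_MODEL["cn"])
#   embs = st_model.encode(["我不会", "欣赏"], normalize_embeddings=True)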

if __name__ == '__main__':
    from sentence_transformers import util

    # Sample product-title sentences (defined as extra test data; unused below)
    sentences = ['现货团湿疹克星来啦纽强二代顺峰宝宝洗护全系列开团宝宝的肌肤守护天使刮码发货正品保证',
                 '简单直接的欢乐客积木桌来了最新品哦',
                 '宝宝拍新品儿童创意沙画激发孩子的想象力和颜色搭配能力预售18号发货',
                 '北鼎多功能G56家用蒸炖锅电蒸锅隔水炖盅全自动可预约好收纳高颜值']

    ss = item2emb_cn(["我不会", "欣赏"])  # "I can't", "appreciate"
    b = util.cos_sim(ss[0], ss[1:])  # (1, 1) tensor of cosine similarity
    print(b)
- # "因此", "总之"