from transformers import AutoTokenizer, AutoModel
from sentence_transformers import util
import torch
import torch.nn.functional as F

from my_config import LANG_EMB_MODEL

tokenizer = AutoTokenizer.from_pretrained(LANG_EMB_MODEL["eng"])
model = AutoModel.from_pretrained(LANG_EMB_MODEL["eng"])


def mean_pooling(model_output, attention_mask):
    # Mean pooling that takes the attention mask into account, so padding
    # tokens do not contribute to the averaged sentence embedding.
    token_embeddings = model_output[0]  # first element of model_output holds all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def item2emb_eng(sentences):
    """Embed a list of English sentences into L2-normalized sentence vectors."""
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform mean pooling over the token embeddings
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings so dot products equal cosine similarities
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings


if __name__ == '__main__':
    # Sample Chinese item titles (product-listing copy: a baby skincare group
    # buy, a building-block table, a children's sand-art kit, and a Buydeem
    # multifunction steam/stew pot), kept as data for the Chinese embedding path.
    sentences = ['现货团湿疹克星来啦纽强二代顺峰宝宝洗护全系列开团宝宝的肌肤守护天使刮码发货正品保证',
                 '简单直接的欢乐客积木桌来了最新品哦',
                 '宝宝拍新品儿童创意沙画激发孩子的想象力和颜色搭配能力预售18号发货',
                 '北鼎多功能G56家用蒸炖锅电蒸锅隔水炖盅全自动可预约好收纳高颜值']
    # item2emb_cn(sentences)

    ss = item2emb_eng(["A highly recommended book", "a highly recommendable book"])
    print(ss)
    # Pairwise cosine similarities between all sentence embeddings
    b = util.cos_sim(ss, ss)
    print(b)
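
# The demo above calls item2emb_cn (commented out), but this file does not
# define it. Below is a minimal sketch of what it might look like, mirroring
# item2emb_eng. It assumes LANG_EMB_MODEL also carries a "cn" entry pointing
# at a Chinese sentence-embedding checkpoint; that key is an assumption about
# my_config, not something this file guarantees. In a real file this would
# sit above the __main__ block.

def item2emb_cn(sentences):
    """Sketch: embed Chinese sentences into L2-normalized vectors."""
    # Assumed (hypothetical) config key: LANG_EMB_MODEL["cn"]. Loading the
    # model inside the function keeps the sketch self-contained; a real
    # implementation would load these once at module level.
    cn_tokenizer = AutoTokenizer.from_pretrained(LANG_EMB_MODEL["cn"])
    cn_model = AutoModel.from_pretrained(LANG_EMB_MODEL["cn"])

    encoded_input = cn_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = cn_model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return F.normalize(sentence_embeddings, p=2, dim=1)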