chinese_emb.py

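"""Chinese sentence embeddings via mean pooling.

Loads the HuggingFace model configured under my_config.LANG_EMB_MODEL["cn"],
mean-pools token embeddings using the attention mask, and L2-normalizes the
result so embeddings can be compared with cosine similarity.
"""
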
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

from my_config import LANG_EMB_MODEL

tokenizer = AutoTokenizer.from_pretrained(LANG_EMB_MODEL["cn"])
model = AutoModel.from_pretrained(LANG_EMB_MODEL["cn"])  # pass from_tf=True if the checkpoint ships only TensorFlow weights
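
# A sketch of what my_config.py is assumed to provide; the model id below is a
# hypothetical choice, any HuggingFace sentence-embedding checkpoint would fit:
#
#     LANG_EMB_MODEL = {
#         "cn": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
#     }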


# Mean Pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
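
# For a sentence with token embeddings t_i and attention mask m_i, the pooling
# above computes sum_i(m_i * t_i) / max(sum_i(m_i), 1e-9), so padding tokens
# drop out of the average.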


def item2emb_cn(sentences):
    """Embed a list of Chinese sentences; returns L2-normalized embeddings."""
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings
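
# Since item2emb_cn returns unit-length vectors, cosine similarity reduces to
# a plain dot product: ss[0] @ ss[1:].T yields the same values as
# util.cos_sim(ss[0], ss[1:]) in the demo below.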


if __name__ == '__main__':
    from sentence_transformers import util

    # Example product titles (Chinese e-commerce listings):
    sentences = [
        # "In-stock group buy: the eczema buster is here! NiuQiang 2nd-gen / Shunfeng full baby bath-and-care line; guardian angel for baby skin; codes scratched before shipping; authenticity guaranteed"
        '现货团湿疹克星来啦纽强二代顺峰宝宝洗护全系列开团宝宝的肌肤守护天使刮码发货正品保证',
        # "The simple, no-frills Huanleke building-block table is here, the newest item"
        '简单直接的欢乐客积木桌来了最新品哦',
        # "Baobaopai new arrival: children's creative sand art that sparks kids' imagination and color-matching skills; presale ships on the 18th"
        '宝宝拍新品儿童创意沙画激发孩子的想象力和颜色搭配能力预售18号发货',
        # "Buydeem multifunction G56 household steam/stew pot, electric steamer, water-bath stew cup; fully automatic, schedulable, easy to store, good-looking"
        '北鼎多功能G56家用蒸炖锅电蒸锅隔水炖盅全自动可预约好收纳高颜值',
    ]

    ss = item2emb_cn(["我不会", "欣赏"])  # "I can't", "appreciate"
    b = util.cos_sim(ss[0], ss[1:])  # similarity of the first embedding vs. the rest
    print(b)
    # leftover test words: "因此" ("therefore"), "总之" ("in short")