I have similar solution but might be useful for pandas
import math
import re
from collections import Counter
import pandas as pd
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
intersection = set(vec1.keys()) & set(vec2.keys())
numerator = sum([vec1[x] * vec2[x] for x in intersection])
sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
denominator = math.sqrt(sum1) * math.sqrt(sum2)
if not denominator:
return 0.0
else:
return float(numerator) / denominator
def text_to_vector(text):
words = WORD.findall(text)
return Counter(words)
df=pd.read_csv('/content/drive/article.csv')
df['vector1']=df['headline'].apply(lambda x: text_to_vector(x))
df['vector2']=df['snippet'].apply(lambda x: text_to_vector(x))
df['simscore']=df.apply(lambda x: get_cosine(x['vector1'],x['vector2']),axis=1)