The builtin SequenceMatcher
is very slow on large input, here's how it can be done with diff-match-patch:
from diff_match_patch import diff_match_patch
def compute_similarity_and_diff(text1, text2):
dmp = diff_match_patch()
dmp.Diff_Timeout = 0.0
diff = dmp.diff_main(text1, text2, False)
# similarity
common_text = sum([len(txt) for op, txt in diff if op == 0])
text_length = max(len(text1), len(text2))
sim = common_text / text_length
return sim, diff