-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTextProcessor.py
More file actions
36 lines (27 loc) · 1.18 KB
/
TextProcessor.py
File metadata and controls
36 lines (27 loc) · 1.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import re
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
def clean_text(content: str) -> str:
    """Replace every run of two or more spaces in *content* with a newline.

    Single spaces and all other characters are left untouched.

    A second ``re.sub(r' {2,}', ' ', ...)`` pass used to follow this one.
    It could never match: once each maximal run of spaces has been replaced
    by a newline, no two adjacent spaces remain. Dropping it leaves the
    result unchanged for every input.

    NOTE(review): if the intent was to collapse blank lines or other
    whitespace as well, the pattern itself needs changing -- confirm
    against the callers' expectations.

    Args:
        content: The raw text to normalise.

    Returns:
        The text with each run of 2+ spaces turned into a single ``"\\n"``.
    """
    return re.sub(r' {2,}', '\n', content)
def chunk_text(content, chunk_size=500, chunk_overlap=100):
    """Split *content* into overlapping chunks using a recursive splitter.

    The text is wrapped in a single ``Document`` and handed to
    ``RecursiveCharacterTextSplitter.split_documents``.

    Args:
        content: The text to split.
        chunk_size: Upper bound on chunk length, measured with ``len``
            (i.e. in characters). Defaults to 500.
        chunk_overlap: Number of characters consecutive chunks may share.
            Defaults to 100.

    Returns:
        The list of chunk documents returned by the splitter. The splitter
        is created with ``add_start_index=True`` (per the LangChain docs this
        records each chunk's offset in its metadata).
    """
    # An empty string has nothing to split; skip building the splitter.
    if content == "":
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    return splitter.split_documents([Document(page_content=content)])
# Embedders keyed by model name. Constructing a HuggingFaceEmbeddings object
# loads the model, so each one is built once and reused across calls.
_EMBEDDING_FUNCTIONS = {}


def _get_embedding_function(model_name):
    """Return the embedder for *model_name*, constructing it on first use."""
    try:
        return _EMBEDDING_FUNCTIONS[model_name]
    except KeyError:
        embedder = HuggingFaceEmbeddings(model_name=model_name)
        _EMBEDDING_FUNCTIONS[model_name] = embedder
        return embedder


def compute_embeddings(
    documents,
    model_name="sentence-transformers/all-mpnet-base-v2",
):
    """Embed the ``page_content`` of every document.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        model_name: Hugging Face model identifier used for the embedder.
            Defaults to ``sentence-transformers/all-mpnet-base-v2``.

    Returns:
        A list with one embedding vector per document, in input order.
        An empty input yields ``[]`` without loading any model.
    """
    texts = [doc.page_content for doc in documents]
    if not texts:
        return []
    # One batched call instead of one embed_query call per document.
    return _get_embedding_function(model_name).embed_documents(texts)
def similarity_search(query_embedding, document_embeddings, documents, k=3):
    """Return the *k* documents most similar to the query, best match first.

    Similarity is the cosine similarity between ``query_embedding`` and each
    row of ``document_embeddings``.

    Args:
        query_embedding: Embedding vector of the query.
        document_embeddings: Sequence of embedding vectors, one per document;
            entry ``i`` is taken to belong to ``documents[i]``.
        documents: Sequence of documents, indexed in parallel with
            ``document_embeddings``.
        k: Maximum number of results to return. Defaults to 3.

    Returns:
        A list of ``(document, similarity)`` tuples sorted by descending
        similarity, containing at most ``k`` entries. Returns ``[]`` when
        ``k`` is not positive or there are no document embeddings.
    """
    # cosine_similarity rejects an empty matrix, and a negative k would slice
    # "all but the last |k|" results, so both cases are answered up front.
    if k <= 0 or len(document_embeddings) == 0:
        return []

    similarities = cosine_similarity([query_embedding], document_embeddings)[0]
    # argsort is ascending; reverse it and keep the k highest scores.
    top_indices = similarities.argsort()[::-1][:k]
    return [(documents[i], similarities[i]) for i in top_indices]