-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproject.py
More file actions
97 lines (78 loc) · 2.64 KB
/
project.py
File metadata and controls
97 lines (78 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import json
import io
from pypdf import PdfReader
from pdf2image import convert_from_bytes
import pytesseract
from pydantic import BaseModel
from ollama import chat
# Shape of the profile extracted from a resume. ai_model() passes the JSON
# schema generated from this class to the model as the required output format.
# NOTE: documented with `#` comments on purpose - pydantic copies a class
# docstring into the generated JSON schema, which would alter that format.
class Profile(BaseModel):  # Structure of the information that is output in main.py (per the original note)
    Name: str  # name found in the resume
    Email: str  # email address
    Phone_Number: str  # phone number, kept as text
    Location: str  # location
    Job: list[str]  # jobs, taken from the experience section (per the prompt in ai_model)
    Education: str  # highest education type (per the prompt in ai_model)
    skills: list[str]  # skills; NOTE: lower-case key, unlike the other fields
# main() only needs the PDF file and a call to pdf_to_json(),
# because pdf_to_json() runs every other function internally (for initial testing).
def main():
    """Run the whole pipeline once on a hard-coded sample CV (manual test entry point).

    The PDF is read from disk in binary mode because ``pdf_to_json`` forwards
    its argument to ``extracted_pdf``, which wraps it in ``io.BytesIO`` and so
    needs the raw file bytes, not a file name. The resulting profile is printed
    as JSON so that running the script produces visible output.

    Raises:
        OSError: If the sample PDF cannot be opened or read.
    """
    pdf = "RayyanAhmedCv.pdf"  # path of the sample PDF, relative to the working directory
    with open(pdf, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    profile = pdf_to_json(pdf_bytes)
    print(json.dumps(profile, indent=2, ensure_ascii=False))
def extracted_pdf(pdf):
    """Extract the text of a PDF supplied as raw bytes.

    The embedded text layer is read with pypdf first. If that fails or yields
    nothing but whitespace (for example an image-only PDF), every page is
    converted to an image and read with pytesseract instead.

    Args:
        pdf: The PDF file content as a bytes-like object (for example the
            bytes of an uploaded file), not a file name.

    Returns:
        The extracted text, with consecutive pages separated by a newline.

    Raises:
        TypeError: If ``pdf`` is not bytes-like (for example a path string).

    Errors raised by the OCR fallback are not caught and propagate to the
    caller.
    """
    if not isinstance(pdf, (bytes, bytearray, memoryview)):
        # Fail early with a clear message instead of a misleading
        # "No extraction" print followed by an error deep inside the OCR path.
        raise TypeError(
            f"extracted_pdf expects the PDF content as bytes, got {type(pdf).__name__}"
        )

    page_texts = []
    try:
        # pypdf needs a file-like object, so the bytes are wrapped in BytesIO.
        reader = PdfReader(io.BytesIO(pdf))
        for page in reader.pages:
            # extract_text() may give nothing for a page; treat that as empty text.
            page_texts.append(page.extract_text() or "")
    except Exception as e:
        # Best effort: keep whatever pages were read and let the OCR fallback
        # below take over when nothing usable was extracted.
        print("No extraction", e)

    # Join with newlines so the last word of one page is not glued to the
    # first word of the next.
    extracted_text = "\n".join(page_texts)

    if not extracted_text.strip():  # no text layer, e.g. the PDF is made of images
        images = convert_from_bytes(pdf)
        extracted_text = "\n".join(
            pytesseract.image_to_string(img) for img in images
        )
    return extracted_text
def ai_model(text):
    """Ask the llama3 model to pull the profile fields out of resume text.

    The resume text is embedded in a fixed instruction prompt and sent as a
    single user message. The reply format is constrained with the JSON schema
    generated from ``Profile``.

    Args:
        text: Resume text, as extracted from the PDF.

    Returns:
        The content of the model's reply, parsed with ``json.loads``.
    """
    prompt = f"""
You are given resume text extracted from a PDF.
Extract the following fields strictly in JSON format:
You are an information extraction system.
You should not add any newline character
- Do NOT add explanations
- Do NOT add markdown
- Do NOT add extra text
- Text should be clean
- Name
- Email
-Phone Number
- Location
- Job (from experience)
- Education (Highest education type)
- Skills (list)
Resume text:
{text} """
    # One user message carrying the whole prompt.
    conversation = [{"role": "user", "content": prompt}]
    # JSON schema derived from the Profile class, so the model follows its structure.
    output_schema = Profile.model_json_schema()
    reply = chat(model="llama3", messages=conversation, format=output_schema)
    # The reply content arrives as a raw string; json.loads turns it into Python data.
    reply_content = reply["message"]["content"]
    return json.loads(reply_content)
def pdf_to_json(pdf):
    """Convert a PDF into profile data using the llama3 model.

    Args:
        pdf: The PDF, passed unchanged to ``extracted_pdf``.

    Returns:
        Whatever ``ai_model`` returns for the text extracted from the PDF.
    """
    # Two-step pipeline: extract the text, then let the model structure it.
    return ai_model(extracted_pdf(pdf))
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()