-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathfunction_app.py
More file actions
98 lines (78 loc) · 2.81 KB
/
function_app.py
File metadata and controls
98 lines (78 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import azure.functions as func
import logging
import requests
import traceback
import validators
from bs4 import BeautifulSoup
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
@app.route(route="search_site", methods=["POST"])
def search_site(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point: scrape the URL supplied by the caller.

    The URL is read from the ``url`` query-string parameter first, then
    from a ``url`` property in the JSON request body. Responds 400 when
    the URL is missing or fails validation, 200 with the scraped data
    otherwise.
    """
    logging.info('Python HTTP trigger function processed a request.')
    # Query string takes precedence; the JSON body is only the fallback.
    url = req.params.get('url')
    if not url:
        try:
            body = req.get_json()
        except ValueError:
            body = None
        if body is not None:
            url = body.get('url')
    # Guard clauses: reject missing / malformed input before doing any work.
    if not url:
        return func.HttpResponse(
            "No URL was passed. Please input a URL.",
            status_code=400
        )
    if not validators.url(url):
        return func.HttpResponse(
            "The URL was invalid.",
            status_code=400
        )
    response = orchestrator_function(url)
    return func.HttpResponse(f"{response}",
    status_code=200)
# Function to orchestrate all of the function calls and store the needed data.
def orchestrator_function(url):
    """Crawl *url* and collect the scraped fields.

    :param url: validated absolute URL to scrape.
    :returns: ``[title, urls, meta_description]`` on success, or ``None``
        when the request or parse fails (the error is logged, not raised).
    """
    try:
        data = crawl_site(url)
        # Order matters to the caller: title first, then links, then meta tag.
        return [
            get_page_title(data),
            get_all_urls(data),
            get_meta_tag(data),
        ]
    except Exception as error:
        # Log the exception itself: `error.__cause__` is None unless the
        # exception was explicitly chained with `raise ... from`, so the
        # original code logged "…: None" on every failure.
        logging.error(f"Error while making a request to the site: {error}")
        logging.error(traceback.format_exc())
# Submits the HTTP request to the user-inputted URL.
def crawl_site(url, timeout=30):
    """Fetch *url* and return its parsed HTML.

    :param url: validated absolute URL to request.
    :param timeout: seconds before the request is aborted. Without a
        timeout a stalled server would hang the Azure Function
        indefinitely; 30s is a safe default and callers may override it.
    :returns: ``BeautifulSoup`` tree built from the response body.
    :raises requests.RequestException: on connection errors or timeout
        (caught and logged by the orchestrator).
    """
    # Redirects are deliberately not followed so only the exact page the
    # caller asked for is scraped.
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")
# Extracts the page title.
def get_page_title(data):
    """Return the document's <title> text, or ``None`` when absent.

    :param data: parsed document tree (a BeautifulSoup object).
    """
    try:
        return data.title.string
    except AttributeError as error:
        # `data.title` is None when the page has no <title> element, so the
        # only expected failure is AttributeError — catch just that. Also
        # log the error itself, not `error.__cause__` (which is normally
        # None and produced a useless "…: None" message).
        logging.error(f"Error retrieving the site title: {error}")
        logging.error(traceback.format_exc())
# Gets all of the URLs from the webpage.
def get_all_urls(data):
    """Return every absolute http(s) link from the page's anchor tags.

    :param data: parsed document tree (a BeautifulSoup object).
    :returns: list of href strings, or ``None`` if extraction fails.
    """
    try:
        # startswith (not substring search): the original `"https://" in url`
        # also matched links that merely *contain* a scheme somewhere, e.g.
        # javascript: handlers or redirect query parameters.
        return [
            element['href']
            for element in data.select("a[href]")
            if element['href'].startswith(("http://", "https://"))
        ]
    except Exception as error:
        # Log the error itself; `error.__cause__` is normally None.
        logging.error(f"Error retrieving the URLs in the site: {error}")
        logging.error(traceback.format_exc())
# Extracts a specific meta tag from the URL.
def get_meta_tag(data):
    """Return the page's meta-description content, or ``None`` when absent.

    :param data: parsed document tree (a BeautifulSoup object).
    """
    try:
        meta_tag = data.find("meta", attrs={'name': 'description'})
        # find() returns None when the tag is absent; the subscript then
        # raises TypeError, which we treat as "no description". KeyError
        # covers a <meta name="description"> tag with no content attribute.
        return meta_tag["content"]
    except (TypeError, KeyError) as error:
        # Fixed copy-pasted log text: this is a meta-tag failure, not a URL
        # failure, and the error itself is logged (not `error.__cause__`,
        # which is normally None).
        logging.error(f"Error retrieving the meta description tag: {error}")
        logging.error(traceback.format_exc())