Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 0 additions & 117 deletions .gitignore

This file was deleted.

4 changes: 4 additions & 0 deletions src/common/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ class SearchOptions(BaseModel):
days: Optional[int] = Field(default=None, description="Number of days back when the CVEs were last modified", alias="days-back", ge=0)
deprecated: Optional[bool] = Field(default=False, description="If set to true, will fetch only the deprecated CPE names", alias="deprecated")
profile: Optional[bool] = Field(default=None, description="Would also run the profile execution of the search and save the results in a file")
epssScoreGt: Optional[float] = Field(default=None, description="Filter by EPSS score greater than", alias="epss-score-gt", gt=0) # New field for EPSS score greater than
epssScoreLt: Optional[float] = Field(default=None, description="Filter by EPSS score less than", alias="epss-score-lt", gt=0) # New field for EPSS score less than
epssPercGt: Optional[float] = Field(default=None, description="Filter by EPSS percentile greater than", alias="epss-perc-gt", ge=0, le=1) # New field for EPSS percentile greater than
epssPercLt: Optional[float] = Field(default=None, description="Filter by EPSS percentile less than", alias="epss-perc-lt", ge=0, le=1) # New field for EPSS percentile less than
output: OutputType = Field(default=OutputType.json, description="Define the output format")

class Config:
Expand Down
13 changes: 12 additions & 1 deletion src/common/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import re
import json
from typing import List, Iterator
from sqlalchemy import Boolean
from sqlalchemy import Boolean, cast, Numeric
from sqlalchemy.sql import text, expression
from sqlalchemy.orm import aliased
from generic import ApplicationContext
Expand Down Expand Up @@ -40,6 +40,17 @@ def search_cves(appctx: ApplicationContext, opts: SearchOptions):

# prepare the search query
query = session.query(cve_table)
# Filter by EPSS score
if opts.epssScoreGt is not None:
query = query.filter(cast(cve_table.data['metrics']['epss']['score'].astext, Numeric) > opts.epssScoreGt)
if opts.epssScoreLt is not None:
query = query.filter(cast(cve_table.data['metrics']['epss']['score'].astext, Numeric) < opts.epssScoreLt)

# Filter by EPSS percentile
if opts.epssPercGt is not None:
query = query.filter(cast(cve_table.data['metrics']['epss']['percentile'].astext, Numeric) > opts.epssPercGt)
if opts.epssPercLt is not None:
query = query.filter(cast(cve_table.data['metrics']['epss']['percentile'].astext, Numeric) < opts.epssPercLt)

# filter by the cve IDS, either directly specified in the search options
if opts.cveId:
Expand Down
5 changes: 4 additions & 1 deletion src/config/setenv/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ file.max.count = 10
[fetch]

; NIST CVE API
url.cve = https://services.nvd.nist.gov/rest/json/cves/2.0

; NIST CPE API
url.cpe = https://services.nvd.nist.gov/rest/json/cpes/2.0
Expand All @@ -55,6 +55,9 @@ url.cwe = https://cwe.mitre.org/data/xml/views/2000.xml.zip
; MITRE CAPEC source file
url.capec = https://capec.mitre.org/data/xml/views/3000.xml.zip

; Cyentia EPSS source file
url.epss = https://epss.cyentia.com

; API_KEY set the value of API key obtained from NVD
api_key = ${NVD_API_KEY}

Expand Down
111 changes: 93 additions & 18 deletions src/load
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,14 @@ from common.util import init_db_schema
SPLIT_BY_COLUMN = re.compile(r'(?<!\\):')
BATCH_SIZE = 256


class ValidationError(Exception):
    """Raised when CLI arguments or configuration parameters are invalid."""
    pass

class StatusError(Exception):
    """Raised when a fetch-status check cannot be completed as expected."""
    pass

# Retry policy for HTTP downloads: up to 10 attempts with exponential
# backoff, retrying GET requests on throttling (403) and service
# unavailable (503) responses.
# NOTE(review): `method_whitelist` was renamed `allowed_methods` in
# urllib3 1.26 and removed in 2.0 — keep urllib3 pinned < 2.0 or rename
# this argument. The duplicate keyword argument (a SyntaxError) has been
# collapsed to a single occurrence.
http_retry_strategy = Retry(
    total=10,
    backoff_factor=0.5,
    status_forcelist=[403, 503],
    method_whitelist=["GET"],
)

# ------------------------------------------------------------------------------
Expand Down Expand Up @@ -134,6 +133,7 @@ def save_cve_data(appctx, data) -> dict:
db_records = []
db_records_cpes = []
if key in data and data[key]:

for cve in data[key]:
cve = cve['cve']

Expand Down Expand Up @@ -203,14 +203,28 @@ def fetch_data_feed(appctx, data_name, args):

import zipfile
import io
import gzip
import csv
from datetime import date

fetch_data_info = fetch_status(appctx, data_name, args)

if fetch_data_info and not args.full:
print(f"{data_name} data is already present. Loaded on {fetch_data_info['last_modified_date']}: {fetch_data_info['stats']['total_records']} records")
return
if data_name == 'epss' and fetch_data_info['last_modified_date'].date() != date.today():
pass
else:
print(f"{data_name} data is already present. Loaded on {fetch_data_info['last_modified_date']}: {fetch_data_info['stats']['total_records']} records")
return

# download the file
data_url = appctx.config.get_param(f'fetch.url.{data_name}', None)

if data_name == 'epss':
today = datetime.now().astimezone(pytz.timezone('UTC'))
previous_day = today - timedelta(days=1)
# download the EPSS data from https://epss.cyentia.com/epss_scores-2023-12-31.csv.gz using the previous day from the current date.
data_url = f"{data_url}/epss_scores-{previous_day.strftime('%Y-%m-%d')}.csv.gz"

if not data_url: raise ValidationError(f'{data_name} url config param not specified')

rest_session = requests.Session()
Expand Down Expand Up @@ -245,19 +259,24 @@ def fetch_data_feed(appctx, data_name, args):
print(f"Request failed: {e}")
return

# extract the content from the zip (load the xml)
# extract the content from the zip (load the xml or csv)
file_contents = []
if content:
with zipfile.ZipFile(content) as zip_ref:
file_contents = [zip_ref.read(file) for file in zip_ref.namelist()][0:1]
if data_name == 'epss':
epss_response = requests.get(data_url, stream=True)
epss_response.raise_for_status()
else:
with zipfile.ZipFile(content) as zip_ref:
file_contents = [zip_ref.read(file) for file in zip_ref.namelist()][0:1]

# convert to json
data_json = None
if not file_contents:
raise RuntimeError('Could not extract anything from the zip content')
else:
import xmltodict
data_json = xmltodict.parse(file_contents[0], force_list=('xhtml:p',), attr_prefix='')
if data_name != 'epss':
data_json = None
if not file_contents:
raise RuntimeError('Could not extract anything from the zip content')
else:
import xmltodict
data_json = xmltodict.parse(file_contents[0], force_list=('xhtml:p',), attr_prefix='')

# process/clean the CWE json data
def get_cwe_data(item):
Expand All @@ -284,6 +303,26 @@ def fetch_data_feed(appctx, data_name, args):
data=item
)

    # process the EPSS data
    def get_epss_data(item):
        """Parse a gzip-compressed EPSS CSV download into a lookup dict.

        item: streamed HTTP response whose ``.raw`` attribute is the
        compressed file object (presumably a ``requests`` Response fetched
        with ``stream=True`` — TODO confirm against the caller).

        Returns a tuple ``(epss_data, date_value)`` where ``epss_data`` maps
        CVE id -> ``[epss_score, percentile, score_date]`` (all kept as the
        strings read from the CSV) and ``date_value`` is the 'YYYY-MM-DD'
        score date extracted from the file's comment header.
        """
        with gzip.open(item.raw, mode='rt') as csv_file:
            csv_reader = csv.reader(csv_file)

            # Skip the first row as it just contains this info: 'model_version:v2023.03.01' and 'score_date:2024-02-12T00:00:00+0000'
            first_row = next(csv_reader)
            # score_date also needed to be extracted:
            # first_row[1] is e.g. 'score_date:2024-02-12T00:00:00+0000';
            # split(':')[1] yields '2024-02-12T00' and split('T')[0] trims
            # the time fragment, leaving the plain date '2024-02-12'.
            date_string = first_row[1].split(':')[1]
            date_value = date_string.split('T')[0]

            next(csv_reader) # skip the next row which is the header information of csv file

            epss_data = {}
            for row in csv_reader:
                # Each data row is [cve_id, epss_score, percentile]; append
                # the score date so every stored value carries its vintage.
                row.append(date_value)
                epss_data[row[0]] = row[1:4]

            return epss_data, date_value

if data_name == 'cwe':

db_records = list(map(get_cwe_data, data_json.get('Weakness_Catalog', {}).get('Weaknesses', {}).get('Weakness', [])))
Expand All @@ -296,9 +335,13 @@ def fetch_data_feed(appctx, data_name, args):
data_date = data_json.get('Attack_Pattern_Catalog', {}).get('Date', None)
save_data_method = save_capec_data

elif data_name == 'epss':
db_records, data_date = get_epss_data(epss_response)
save_data_method = save_epss_data

else: raise ValidationError(f'Unknown data type to save: <{data_name}>')

# load into Cwe/Capec table
# load into Cwe/Capec table or update Vuln table for epss
save_data_method(appctx, args, db_records)

# Update the stats about retrieved information
Expand Down Expand Up @@ -346,6 +389,33 @@ def save_capec_data(appctx, args, db_records):

db_insert_progress.close()

# ------------------------------------------------------------------------------
# Function to save EPSS data to the database
def save_epss_data(appctx, args, db_records):
    """Update the Vuln table JSON documents with freshly fetched EPSS scores.

    appctx     -- application context providing the DB session
    args       -- parsed CLI arguments (unused; kept so all save_*_data
                  functions share the same signature)
    db_records -- dict mapping CVE id -> [epss_score, percentile, score_date]
                  as produced by get_epss_data()

    CVE ids in the EPSS feed with no matching Vuln row are silently skipped
    (the feed can cover CVEs that are not loaded yet).
    """
    from sqlalchemy.orm import load_only

    with appctx.db as session:

        db_insert_progress_name = 'Vuln db update epss'
        bar_format = '{n_fmt}/{total} {l_bar}{bar}| ({elapsed}/{remaining})'
        db_insert_progress = tqdm(total=len(db_records), bar_format=f'{db_insert_progress_name:<20} {bar_format}', ascii=True)

        # Update in bounded batches so the IN (...) list and the per-flush
        # payload stay a reasonable size.
        BATCH_SIZE_epss = 1024
        vuln_ids = list(db_records.keys())  # hoisted: built once, not per batch
        for batch in [vuln_ids[i:i + BATCH_SIZE_epss] for i in range(0, len(vuln_ids), BATCH_SIZE_epss)]:

            # Load vuln_id alongside data so reading it below does not
            # trigger a lazy column load for every row.
            vuln_query = session.query(Vuln).filter(Vuln.vuln_id.in_(batch)).options(load_only(Vuln.vuln_id, Vuln.data)).all()
            update_values = []

            for vuln_record in vuln_query:
                # Find the corresponding record in the batch
                epss_desired_row = db_records[vuln_record.vuln_id]
                # setdefault guards records whose JSON document has no
                # 'metrics' key yet (older/partial CVE entries would
                # otherwise raise KeyError here).
                vuln_record.data.setdefault('metrics', {})['epss'] = {'score': epss_desired_row[0], 'percentile': epss_desired_row[1], 'date': epss_desired_row[2]}
                update_values.append({'id': vuln_record.id, 'data': vuln_record.data})

            session.bulk_update_mappings(Vuln, update_values)
            db_insert_progress.update(len(batch))

        db_insert_progress.close()

# ------------------------------------------------------------------------------
def fetch_data(appctx, data_name, args):
Expand All @@ -356,7 +426,7 @@ def fetch_data(appctx, data_name, args):
method = save_cve_data
elif data_name == 'cpe':
method = save_cpe_data
elif data_name in ('cwe', 'capec'):
elif data_name in ('cwe', 'capec', 'epss'):
fetch_data_feed(appctx, data_name, args)
return
else:
Expand Down Expand Up @@ -522,7 +592,7 @@ def fetch_status(appctx, name, args, data=None):
# then we need to save/update the data in the fetch status table
if data:

if name == 'cve': table_name = Vuln
if name == 'cve' or name == 'epss': table_name = Vuln
elif name == 'cpe': table_name = Cpe
elif name == 'cwe': table_name = Cwe
elif name == 'capec': table_name = Capec
Expand Down Expand Up @@ -579,7 +649,7 @@ def main():
# --------------------------------------------------------------------------
# Parse the arguments and Validate
parser = argparse.ArgumentParser(description="FastCVE database Loader")
parser.add_argument('-d', '--data', dest='data', nargs='+', required=True, choices=['cpe', 'cve', 'cwe', 'capec'], help='Specify data to be loaded')
parser.add_argument('-d', '--data', dest='data', nargs='+', required=True, choices=['cpe', 'cve', 'cwe', 'capec', 'epss'], help='Specify data to be loaded')
parser.add_argument('--full', dest='full', action='store_true', help='Will consider to fetch all data again')
parser.add_argument('--drop', dest='drop', action='store_true', help='This will drop first existing data')
parser.add_argument('-f', '--from', dest='from_date', action='store', help='From Date YYYY-MM-DD["T"HH:MI:SS]')
Expand Down Expand Up @@ -607,7 +677,12 @@ def main():

validate_opts(args_dict)

# now we need to fetch the data from NVD/NIST/MITRE using the API and populate the DB
# if epss comes before cve (load epss cve), first remove epss and add epss at end (epss must be loaded after cve)
if 'cve' in args.data and 'epss' in args.data:
args.data.remove('epss')
args.data.append('epss')

# now we need to fetch the data from NVD/NIST/MITRE/Cyentia using the API and populate the DB
for data in args.data:
fetch_data(appctx, data, args)

Expand Down
Loading