support/scripts: use FKIE git tree

Currently, we grab the per-year CVE feeds in two passes: first, we
fetch the meta files and check whether anything has changed since we
last downloaded them; second, we download the feed proper, unless the
meta file has not changed, in which case we use the locally cached
feed.
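
For reference, the old two-pass logic boils down to the following
condensed sketch of the download_nvd_year() code removed below (names
shortened for brevity):

    import os
    import requests

    NVD_BASE_URL = "https://github.com/fkie-cad/nvd-json-data-feeds/releases/latest/download"

    def refresh_feed(nvd_path, year):
        # First pass: fetch the small .meta file.
        meta = requests.get("%s/CVE-%s.meta" % (NVD_BASE_URL, year))
        meta.raise_for_status()
        path_meta = os.path.join(nvd_path, "CVE-%s.meta" % year)
        path_feed = os.path.join(nvd_path, "CVE-%s.json.xz" % year)
        # Meta unchanged since the last download: reuse the cached feed.
        if os.path.exists(path_meta) and os.path.exists(path_feed) \
           and open(path_meta, "r").read() == meta.text:
            return path_feed
        # Second pass: download the feed proper.
        feed = requests.get("%s/CVE-%s.json.xz" % (NVD_BASE_URL, year))
        feed.raise_for_status()
        open(path_feed, "wb").write(feed.content)
        open(path_meta, "w").write(meta.text)
        return path_feed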

However, it turns out that the FKIE releases no longer provide the
meta files, which means that, once again, our daily reports are
broken.

The obvious fix would be to drop the use of the meta files and
unconditionally download the feeds. That is relatively trivial to do,
but the feeds are rather big (even xz-compressed).

However, the CVE database from FKIE is also available as a git tree.
Git is pretty good at sending only deltas when updating a local copy.
In addition, the git tree contains each CVE as an individual file, so
it is easier to scan and parse.
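
In essence, the update then reduces to a clone-or-pull, mirroring the
new download_nvd() in the patch below:

    import os
    import subprocess

    NVD_GIT_URL = "https://github.com/fkie-cad/nvd-json-data-feeds/"

    def update_nvd_git(nvd_git_dir):
        # A pull only transfers the objects missing locally, so daily
        # updates are cheap compared to re-downloading whole feeds.
        if os.path.exists(nvd_git_dir):
            subprocess.check_call(["git", "pull"], cwd=nvd_git_dir)
        else:
            os.makedirs(nvd_git_dir)
            subprocess.check_call(["git", "clone", NVD_GIT_URL, nvd_git_dir])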

Switch to using a local git clone.

Somewhat surprisingly (but not that much either), parsing the CVE
files is much faster when using the git working copy than when parsing
the per-year feeds: indeed, the per-year feeds are xz-compressed, and
even though python is slow-ish at scanning a directory and opening the
files therein, that is still much faster than decompressing xz files.
The timing [0] goes from ~100s before to ~10s now, about a tenfold
improvement, over the whole package set.

The drawback, however, is that the git tree is much bigger on disk:
from ~55MiB for the per-year compressed feeds to ~2.1GiB for the git
clone (~366MiB of git objects plus a ~1.8GiB working copy). Given that
very few people are going to use it, this is considered acceptable.

In the end, with a bit of hacking [1], the two pkg-stats runs, before
and after this change, yield the same data (except for the date and
commit hash).

[0] hacking support/scripts/pkg-stats to display the time before/after
the CVE scan, and hacking support/scripts/cve.py to skip the download,
so that only the CVE scan is measured (and also because the meta files
are no longer available).
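
A hypothetical sketch of that timing hack (helper name and placement
are illustrative; the hack itself is not part of this commit):

    import time

    def timed(label, func, *args, **kwargs):
        # Print how long a single step takes.
        start = time.monotonic()
        result = func(*args, **kwargs)
        print("%s took %.1fs" % (label, time.monotonic() - start))
        return result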

[1] sorting the CVE lists in the JSON output, sorting the JSON keys,
and using the commit from the FKIE git tree that matches the current
per-year feeds.
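
A possible shape for that normalization (the pkg-stats JSON layout is
assumed here, not taken from this commit):

    import json

    def normalized(path):
        with open(path) as f:
            data = json.load(f)
        # Assumed layout: a top-level "packages" dict whose entries
        # carry a "cves" list.
        for pkg in data["packages"].values():
            pkg.get("cves", []).sort()
        return json.dumps(data, sort_keys=True, indent=2)

Two outputs then compare equal when
normalized(before) == normalized(after).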

Signed-off-by: Yann E. MORIN <yann.morin.1998@free.fr>
Cc: Arnout Vandecappelle (Essensium/Mind) <arnout@mind.be>
Cc: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
Signed-off-by: Arnout Vandecappelle <arnout@mind.be>
(cherry picked from commit fee7efafd0)
Signed-off-by: Peter Korsgaard <peter@korsgaard.com>

--- a/support/scripts/cve.py
+++ b/support/scripts/cve.py
@@ -19,29 +19,16 @@
 import datetime
 import os
-import requests  # URL checking
 import distutils.version
-import lzma
-import time
+import json
+import subprocess
 import sys
 import operator
 
-try:
-    import ijson
-    # backend is a module in < 2.5, a string in >= 2.5
-    if 'python' in getattr(ijson.backend, '__name__', ijson.backend):
-        try:
-            import ijson.backends.yajl2_cffi as ijson
-        except ImportError:
-            sys.stderr.write('Warning: Using slow ijson python backend\n')
-except ImportError:
-    sys.stderr.write("You need ijson to parse NVD for CVE check\n")
-    exit(1)
-
 sys.path.append('utils/')
 
 NVD_START_YEAR = 1999
-NVD_BASE_URL = "https://github.com/fkie-cad/nvd-json-data-feeds/releases/latest/download"
+NVD_BASE_URL = "https://github.com/fkie-cad/nvd-json-data-feeds/"
 
 ops = {
     '>=': operator.ge,
@@ -81,41 +68,24 @@ class CVE:
         self.nvd_cve = nvd_cve
 
     @staticmethod
-    def download_nvd_year(nvd_path, year):
-        metaf = "CVE-%s.meta" % year
-        path_metaf = os.path.join(nvd_path, metaf)
-        jsonf_xz = "CVE-%s.json.xz" % year
-        path_jsonf_xz = os.path.join(nvd_path, jsonf_xz)
-
-        # If the database file is less than a day old, we assume the NVD data
-        # locally available is recent enough.
-        if os.path.exists(path_jsonf_xz) and os.stat(path_jsonf_xz).st_mtime >= time.time() - 86400:
-            return path_jsonf_xz
-
-        # If not, we download the meta file
-        url = "%s/%s" % (NVD_BASE_URL, metaf)
-        print("Getting %s" % url)
-        page_meta = requests.get(url)
-        page_meta.raise_for_status()
-
-        # If the meta file already existed, we compare the existing
-        # one with the data newly downloaded. If they are different,
-        # we need to re-download the database.
-        # If the database does not exist locally, we need to redownload it in
-        # any case.
-        if os.path.exists(path_metaf) and os.path.exists(path_jsonf_xz):
-            meta_known = open(path_metaf, "r").read()
-            if page_meta.text == meta_known:
-                return path_jsonf_xz
-
-        # Grab the compressed JSON NVD, and write files to disk
-        url = "%s/%s" % (NVD_BASE_URL, jsonf_xz)
-        print("Getting %s" % url)
-        page_json = requests.get(url)
-        page_json.raise_for_status()
-
-        open(path_jsonf_xz, "wb").write(page_json.content)
-        open(path_metaf, "w").write(page_meta.text)
-        return path_jsonf_xz
+    def download_nvd(nvd_git_dir):
+        print(f"Updating from {NVD_BASE_URL}")
+        if os.path.exists(nvd_git_dir):
+            subprocess.check_call(
+                ["git", "pull"],
+                cwd=nvd_git_dir,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+        else:
+            # Create the directory and its parents; git
+            # happily clones into an empty directory.
+            os.makedirs(nvd_git_dir)
+            subprocess.check_call(
+                ["git", "clone", NVD_BASE_URL, nvd_git_dir],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
 
     @staticmethod
     def sort_id(cve_ids):
@@ -131,15 +101,15 @@ class CVE:
         feeds since NVD_START_YEAR. If the files are missing or outdated in
         nvd_dir, a fresh copy will be downloaded, and kept in .json.gz
         """
+        nvd_git_dir = os.path.join(nvd_dir, "git")
+        CVE.download_nvd(nvd_git_dir)
         for year in range(NVD_START_YEAR, datetime.datetime.now().year + 1):
-            filename = CVE.download_nvd_year(nvd_dir, year)
-            try:
-                content = ijson.items(lzma.LZMAFile(filename), 'cve_items.item')
-            except:  # noqa: E722
-                print("ERROR: cannot read %s. Please remove the file then rerun this script" % filename)
-                raise
-            for cve in content:
-                yield cls(cve)
+            for dirpath, _, filenames in os.walk(os.path.join(nvd_git_dir, f"CVE-{year}")):
+                for filename in filenames:
+                    if filename[-5:] != ".json":
+                        continue
+                    with open(os.path.join(dirpath, filename), "rb") as f:
+                        yield cls(json.load(f))
 
     def each_product(self):
         """Iterate over each product section of this cve"""