Skip to content

Commit

Permalink
Adjust code for consistency across the codebase #181
Browse files Browse the repository at this point in the history
Signed-off-by: Thomas Druez <tdruez@nexb.com>
  • Loading branch information
tdruez committed Aug 3, 2021
1 parent 2220809 commit a48eb4c
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 77 deletions.
2 changes: 1 addition & 1 deletion scanpipe/pipelines/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def extract_layers(self):

def find_images_os_and_distro(self):
"""
Find the operating system and distro of the images.
Finds the operating system and distro of input images.
"""
for image in self.images:
image.get_and_set_distro()
Expand Down
9 changes: 4 additions & 5 deletions scanpipe/pipelines/windows_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

class WindowsDocker(Docker):
"""
A pipeline to analyze a Windows Docker image.
A pipeline to analyze Windows Docker images.
"""

@classmethod
Expand All @@ -53,14 +53,13 @@ def steps(cls):

def tag_known_software_packages(self):
"""
Flag files from well-known software packages by checking common install
paths
Flag files from well-known software packages by checking common install paths.
"""
windows.tag_known_software(self.project)

def tag_uninteresting_codebase_resources(self):
"""
Flag files that are known to be uninteresting
Flag files that are known to be uninteresting.
"""
docker.tag_whiteout_codebase_resources(self.project)
windows.tag_uninteresting_windows_codebase_resources(self.project)
Expand All @@ -70,7 +69,7 @@ def tag_uninteresting_codebase_resources(self):
def tag_program_files_dirs_as_packages(self):
"""
Report the immediate subdirectories of `Program Files` and `Program
Files (x86)` as packages
Files (x86)` as packages.
"""
windows.tag_program_files(self.project)

Expand Down
27 changes: 17 additions & 10 deletions scanpipe/pipes/rootfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,18 +351,19 @@ def tag_ignorable_codebase_resources(project):
for pattern in default_ignores.keys():
# Translate glob pattern to regex
translated_pattern = fnmatch.translate(pattern)
# postgresql does not like parts of Python regex
# PostgreSQL does not like parts of Python regex
if translated_pattern.startswith("(?s"):
translated_pattern = translated_pattern.replace("(?s", "(?")
lookups |= Q(rootfs_path__icontains=pattern)
lookups |= Q(rootfs_path__iregex=translated_pattern)

qs = project.codebaseresources.no_status()
qs.filter(lookups).update(status="ignored-default-ignores")


def tag_data_files_with_no_clues(project):
"""
Tag CodebaseResources that have a file type of `data` and no detected clues
Tags CodebaseResources that have a file type of `data` and no detected clues
to be uninteresting.
"""
lookup = Q(
Expand All @@ -375,15 +376,18 @@ def tag_data_files_with_no_clues(project):
emails=[],
urls=[],
)
project.codebaseresources.filter(lookup).update(status="ignored-data-file-no-clues")

qs = project.codebaseresources
qs.filter(lookup).update(status="ignored-data-file-no-clues")


def tag_media_files_as_uninteresting(project):
"""
Tag CodebaseResources that are media files to be uninteresting.
Tags CodebaseResources that are media files to be uninteresting.
`mimes` and `types` are taken from TypeCode:
https://github.com/nexB/typecode/blob/main/src/typecode/contenttype.py#L528
"""
# `mimes` and `types` were taken from TypeCode
# https://github.com/nexB/typecode/blob/c38f6831c59acae02a34a1288b9ce16e2e1f1733/src/typecode/contenttype.py#L528
mimes = (
"image",
"picture",
Expand All @@ -392,6 +396,7 @@ def tag_media_files_as_uninteresting(project):
"graphic",
"sound",
)

types = (
"image data",
"graphics image",
Expand All @@ -417,10 +422,12 @@ def tag_media_files_as_uninteresting(project):
"image data",
"netpbm",
)

lookup = Q()
for m in mimes:
lookup |= Q(mime_type__icontains=m)
for t in types:
lookup |= Q(file_type__icontains=t)
for mime_type in mimes:
lookup |= Q(mime_type__icontains=mime_type)
for file_type in types:
lookup |= Q(file_type__icontains=file_type)

qs = project.codebaseresources.no_status()
qs.filter(lookup).update(status="ignored-media-file")
113 changes: 53 additions & 60 deletions scanpipe/pipes/windows.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,57 +98,38 @@ def tag_installed_package_files(project, root_dir_pattern, package, q_objects=[]
For all CodebaseResources from `project` whose `rootfs_path` starts with
`root_dir_pattern`, add `package` to the discovered_packages of each
CodebaseResource and set the status.
If there are Q() objects in `q_objects`, then those Q() objects are chained
to the initial query (`lookup`) using AND to allow a more specific query for
package files.
"""
qs = project.codebaseresources.no_status()
lookup = Q(rootfs_path__startswith=root_dir_pattern)

# If there are Q() objects in `q_objects`, then those Q() objects are chained
# to the initial query `lookup` using AND to allow a more specific query for
# package files.
for q_object in q_objects:
lookup &= q_object

installed_package_files = qs.filter(lookup)
# If we find files whose names start with `root_dir_pattern`, we consider
# these files to be part of the Package `package` and tag these files as
# such
# these files to be part of the Package `package` and tag these files as such.
if installed_package_files:
created_package = pipes.update_or_create_package(
project=project, package_data=package.to_dict()
)
created_package = pipes.update_or_create_package(project, package.to_dict())
for installed_package_file in installed_package_files:
installed_package_file.discovered_packages.add(created_package)
installed_package_file.status = "installed-package"
installed_package_file.save()
created_package.save()


def tag_known_software(project):
"""
Find Windows software in `project` by checking `project`'s CodebaseResources
to see if their rootfs_path is under a known software root directory. If
there are CodebaseResources that are under a known software root directory,
a DiscoveredPackage is created for that software package and all files under
that software package's root directory are considered installed files for
that package.
Currently, we are only checking for Python and openjdk in Windows Docker
image layers.
If a version number cannot be determined for an installed software Package,
then a version number of "nv" will be set.
"""
def _tag_python_software(project):
qs = project.codebaseresources.no_status()
python_root_directory_name_pattern = r"(^/(Files/)?Python(\d+)?)/.*$"
python_root_directory_name_pattern_compiled = re.compile(
python_root_directory_name_pattern
)
python_root_pattern = r"(^/(Files/)?Python(\d+)?)/.*$"
python_root_pattern_compiled = re.compile(python_root_pattern)

python_versions_by_path = {}
for python_codebase_resource in qs.filter(
rootfs_path__regex=python_root_directory_name_pattern
):
for python_resource in qs.filter(rootfs_path__regex=python_root_pattern):
_, python_root_dir, _, version, _ = re.split(
python_root_directory_name_pattern_compiled,
python_codebase_resource.rootfs_path,
python_root_pattern_compiled,
python_resource.rootfs_path,
)
if python_root_dir in python_versions_by_path:
continue
Expand Down Expand Up @@ -177,19 +158,16 @@ def tag_known_software(project):
q_objects=q_objects,
)


def _tag_openjdk_software(project):
qs = project.codebaseresources.no_status()
openjdk_root_directory_name_pattern = (
r"^(/(Files/)?(open)?jdk(-((\d*)(\.\d+)*))*)/.*$"
)
openjdk_root_directory_name_pattern_compiled = re.compile(
openjdk_root_directory_name_pattern
)
openjdk_root_pattern = r"^(/(Files/)?(open)?jdk(-((\d*)(\.\d+)*))*)/.*$"
openjdk_root_pattern_compiled = re.compile(openjdk_root_pattern)

openjdk_versions_by_path = {}
for openjdk_codebase_resource in qs.filter(
rootfs_path__regex=openjdk_root_directory_name_pattern
):
for openjdk_codebase_resource in qs.filter(rootfs_path__regex=openjdk_root_pattern):
_, openjdk_root_path, _, _, _, openjdk_version, _, _, _ = re.split(
openjdk_root_directory_name_pattern_compiled,
openjdk_root_pattern_compiled,
openjdk_codebase_resource.rootfs_path,
)
if openjdk_root_path in openjdk_versions_by_path:
Expand All @@ -207,10 +185,31 @@ def tag_known_software(project):
homepage_url="http://openjdk.java.net/",
)
tag_installed_package_files(
project=project, root_dir_pattern=openjdk_path, package=openjdk_package
project=project,
root_dir_pattern=openjdk_path,
package=openjdk_package,
)


def tag_known_software(project):
"""
Find Windows software in `project` by checking `project`'s CodebaseResources
to see if their rootfs_path is under a known software root directory. If
there are CodebaseResources that are under a known software root directory,
a DiscoveredPackage is created for that software package and all files under
that software package's root directory are considered installed files for
that package.
Currently, we are only checking for Python and openjdk in Windows Docker
image layers.
If a version number cannot be determined for an installed software Package,
then a version number of "nv" will be set.
"""
_tag_python_software(project)
_tag_openjdk_software(project)


PROGRAM_FILES_DIRS_TO_IGNORE = (
"Common Files",
"Microsoft",
Expand All @@ -219,23 +218,21 @@ def tag_known_software(project):

def tag_program_files(project):
"""
Report all subdirectories of Program Files and Program Files (x86) as
Packages
Report all subdirectories of Program Files and Program Files (x86) as Packages.
If a Package is detected in this manner, then we will attempt to determine
the version from the path. If a version cannot be determined, a version of
`nv` will be set for the Package.
"""
qs = project.codebaseresources.no_status()
# Get all files from Program Files and Program Files (x86)
program_files_one_directory_below_pattern = r"(^.*Program Files( \(x86\))?/([^/]+))"
program_files_one_directory_below_pattern_compiled = re.compile(
program_files_one_directory_below_pattern
)
program_files_subdir_pattern = r"(^.*Program Files( \(x86\))?/([^/]+))"
program_files_subdir_pattern_compiled = re.compile(program_files_subdir_pattern)

program_files_dirname_by_path = {}
for program_file in qs.filter(rootfs_path__regex=r"^.*/Program Files( \(x86\))?"):
_, program_files_subdir, _, dirname, _ = re.split(
program_files_one_directory_below_pattern_compiled, program_file.rootfs_path
program_files_subdir_pattern_compiled, program_file.rootfs_path
)
if (
program_files_subdir in program_files_dirname_by_path
Expand All @@ -244,14 +241,10 @@ def tag_program_files(project):
continue
program_files_dirname_by_path[program_files_subdir] = dirname

for (
program_root_dir,
program_root_dir_name,
) in program_files_dirname_by_path.items():
package = win_reg.InstalledWindowsProgram(
name=program_root_dir_name,
version="nv",
)
for root_dir, root_dir_name in program_files_dirname_by_path.items():
package = win_reg.InstalledWindowsProgram(name=root_dir_name, version="nv")
tag_installed_package_files(
project=project, root_dir_pattern=program_root_dir, package=package
project=project,
root_dir_pattern=root_dir,
package=package,
)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@
],
"scancodeio_pipelines": [
"docker = scanpipe.pipelines.docker:Docker",
"windows_docker = scanpipe.pipelines.windows_docker:WindowsDocker",
"load_inventory = scanpipe.pipelines.load_inventory:LoadInventory",
"root_filesystems = scanpipe.pipelines.root_filesystems:RootFS",
"scan_codebase = scanpipe.pipelines.scan_codebase:ScanCodebase",
"scan_package = scanpipe.pipelines.scan_package:ScanPackage",
"windows_docker = scanpipe.pipelines.windows_docker:WindowsDocker",
],
},
classifiers=[
Expand Down

0 comments on commit a48eb4c

Please sign in to comment.