Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patched np.int, feature hashing problem, and lief errors version #110

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 42 additions & 31 deletions ember/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@
for your modeling problem.
'''

import hashlib
import json
import os
import re

import lief
import hashlib
import numpy as np
import os
import json
from sklearn.feature_extraction import FeatureHasher

LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.')
LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or ( int(LIEF_MAJOR)==0 and int(LIEF_MINOR) >= 10 )
LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 10)
LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11)


Expand Down Expand Up @@ -97,7 +98,7 @@ def _entropy_bin_counts(self, block):
return Hbin, c

def raw_features(self, bytez, lief_binary):
output = np.zeros((16, 16), dtype=np.int)
output = np.zeros((16, 16), dtype=int)
a = np.frombuffer(bytez, dtype=np.uint8)
if a.shape[0] < self.window:
Hbin, c = self._entropy_bin_counts(a)
Expand Down Expand Up @@ -142,22 +143,24 @@ def raw_features(self, bytez, lief_binary):
return {"entry": "", "sections": []}

# properties of entry point, or if invalid, the first executable section

not_found_error_class = RuntimeError if not lief.__version__.startswith("0.9.0") else lief.not_found
try:
if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12):
section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase)

if section is None:
raise lief.not_found
raise not_found_error_class
entry_section = section.name
else: # lief < 0.12
else: # lief < 0.12
entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
except lief.not_found:
# bad entry point, let's find the first executable section
entry_section = ""
for s in lief_binary.sections:
if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
entry_section = s.name
break
except not_found_error_class:
# bad entry point, let's find the first executable section
entry_section = ""
mem_execute_characteristics = lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE if lief.__version__.startswith("0.9.0") else lief.PE.Section.CHARACTERISTICS.MEM_EXECUTE
for s in lief_binary.sections:
if mem_execute_characteristics in s.characteristics_lists:
entry_section = s.name
break

raw_obj = {"entry": entry_section}
raw_obj["sections"] = [{
Expand Down Expand Up @@ -189,7 +192,7 @@ def process_raw_features(self, raw_obj):
section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
section_vsize = [(s['name'], s['vsize']) for s in sections]
section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
entry_name_hashed = FeatureHasher(50, input_type="string").transform([raw_obj['entry']]).toarray()[0]
entry_name_hashed = FeatureHasher(50, input_type="string").transform([[raw_obj['entry']]]).toarray()[0]
characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]

Expand Down Expand Up @@ -267,7 +270,6 @@ def raw_features(self, bytez, lief_binary):
# export is a string (LIEF 0.9.0 and earlier)
clipped_exports = [export[:10000] for export in lief_binary.exported_functions]


return clipped_exports

def process_raw_features(self, raw_obj):
Expand Down Expand Up @@ -318,7 +320,7 @@ def process_raw_features(self, raw_obj):
raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'],
raw_obj['symbols']
],
dtype=np.float32)
dtype=np.float32)


class HeaderFileInfo(FeatureType):
Expand Down Expand Up @@ -499,15 +501,15 @@ class PEFeatureExtractor(object):
def __init__(self, feature_version=2, print_feature_warning=True, features_file=''):
self.features = []
features = {
'ByteHistogram': ByteHistogram(),
'ByteEntropyHistogram': ByteEntropyHistogram(),
'StringExtractor': StringExtractor(),
'GeneralFileInfo': GeneralFileInfo(),
'HeaderFileInfo': HeaderFileInfo(),
'SectionInfo': SectionInfo(),
'ImportsInfo': ImportsInfo(),
'ExportsInfo': ExportsInfo()
}
'ByteHistogram': ByteHistogram(),
'ByteEntropyHistogram': ByteEntropyHistogram(),
'StringExtractor': StringExtractor(),
'GeneralFileInfo': GeneralFileInfo(),
'HeaderFileInfo': HeaderFileInfo(),
'SectionInfo': SectionInfo(),
'ImportsInfo': ImportsInfo(),
'ExportsInfo': ExportsInfo()
}

if os.path.exists(features_file):
with open(features_file, encoding='utf8') as f:
Expand All @@ -520,22 +522,31 @@ def __init__(self, feature_version=2, print_feature_warning=True, features_file=
if not lief.__version__.startswith("0.8.3"):
if print_feature_warning:
print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75")
print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(
f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(f"WARNING: in the feature calculations.")
elif feature_version == 2:
self.features.append(DataDirectories())
if not lief.__version__.startswith("0.9.0"):
if print_feature_warning:
print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-")
print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(
f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(f"WARNING: in the feature calculations.")
else:
raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}")
self.dim = sum([fe.dim for fe in self.features])

def raw_features(self, bytez):
lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound,
RuntimeError)
if lief.__version__.startswith("0.9.0"):
lief_errors = (
lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, RuntimeError)
else:
lief_errors = (
lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error,
lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound,
RuntimeError)

try:
lief_binary = lief.PE.parse(list(bytez))
except lief_errors as e:
Expand Down