Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement/various filetypes #69

Open
wants to merge 25 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
c84208d
FEATURE: improvement in text file classification as well as others
Feb 21, 2023
a40a901
FEATURE: various new detections
Jul 10, 2023
358ac66
FIX: remove duplicate image detection
Jul 10, 2023
ede016a
FEATURE: detect steam files
Jul 10, 2023
9c9c731
FEATURE: test file for utf-8 encoding
Jul 10, 2023
5ef56b5
FIX: change file for privacy reasons
Jul 10, 2023
51a453e
MISC: rename test
Jul 10, 2023
eecac86
Merge branch 'master' into enhancement/various_filetypes
psrok1 Sep 7, 2023
f3f1605
Merge branch 'master' into enhancement/various_filetypes
psrok1 Sep 7, 2023
28c5e05
Fixed lint errors
psrok1 Sep 7, 2023
7a2edbb
Fix: wrong image extension when classification is not based on magic …
psrok1 Sep 7, 2023
ba48fc0
Fix apk classification (#72)
nazywam Oct 2, 2023
b136356
Add a useful log (#73)
nazywam Oct 2, 2023
b0d88ca
merge upstream, remove some too specific detections
Oct 10, 2023
f259186
FEATURE: various new detections
Oct 10, 2023
c8a8b75
Merge branch 'CERT-Polska:master' into enhancement/various_filetypes
r1d3th3wav3s Oct 10, 2023
857c230
additional unit-tests
Oct 10, 2023
d7488ef
remove some too specific stuff
Oct 10, 2023
5030962
remove duplicate code from merge
Oct 10, 2023
3c69e31
remove too specific detection
Oct 10, 2023
ac55da8
fix from merge
Oct 10, 2023
7639586
return sample_class, otherwise unit-tests fail
Oct 10, 2023
f494da8
Update karton/classifier/classifier.py
nazywam Nov 9, 2023
29c6157
Apply suggestions from code review
nazywam Nov 9, 2023
1de356f
Apply suggestions from code review
nazywam Nov 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 135 additions & 59 deletions karton/classifier/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,17 +139,17 @@ def process(self, task: Task) -> None:
sample_classes += self._classify_yara(task)

filemagic_classification = self._classify_filemagic(task)
if filemagic_classification:
if filemagic_classification["kind"] is not None:
sample_classes.append(filemagic_classification)

file_name = sample.name or "sample"

if not sample_classes:
self.log.info(
"Sample {!r} not recognized (unsupported type)".format(
file_name.encode("utf8")
)
"Sample {} (sha256: {}) not recognized (unsupported type)".format(
file_name, sample.sha256)
nazywam marked this conversation as resolved.
Show resolved Hide resolved
)

res = task.derive_task(
{
"type": "sample",
Expand Down Expand Up @@ -213,9 +213,12 @@ def _get_extension(self, name: str) -> str:
splitted = name.rsplit(".", 1)
return splitted[-1].lower() if len(splitted) > 1 else ""

def _classify_filemagic(self, task: Task) -> Optional[Dict[str, Optional[str]]]:
def _classify_filemagic(self, task: Task) -> Dict[str, Optional[str]]:
sample = task.get_resource("sample")
content = cast(bytes, sample.content)
file_name = sample.name
if not sample.content:
self.log.info("Sample: {} has no content".format(file_name))

magic = task.get_payload("magic") or ""
magic_mime = task.get_payload("mime") or ""
Expand Down Expand Up @@ -592,6 +595,67 @@ def apply_archive_headers(extension):
sample_class.update({"kind": "archive", "extension": extension})
return sample_class

# PGP
if magic.startswith("PGP") or magic.startswith("OpenPGP"):
sample_class.update(
{
"kind": "pgp",
}
)
return sample_class

# PCAP
if magic.startswith(("pcap capture file", "tcpdump capture file")):
sample_class.update(
{
"kind": "pcap",
}
)
return sample_class

if magic.startswith("pcap") and "ng capture file" in magic:
sample_class.update(
{
"kind": "pcapng",
}
)
return sample_class

# Images
if magic.startswith("JPEG"):
sample_class.update(
{
"kind": "jpeg",
}
)
return sample_class

if magic.startswith("PNG"):
sample_class.update(
{
"kind": "png",
}
)
return sample_class

# Wallets
if content.startswith(b"\xbaWALLET"):
sample_class.update(
{
"kind": "armory_wallet",
nazywam marked this conversation as resolved.
Show resolved Hide resolved
}
)
return sample_class

# IOT / OT
if content.startswith(b"SECO"):
sample_class.update(
{
"kind": "seco",
}
)
return sample_class

# HTML
if magic.startswith("HTML document"):
sample_class.update({"kind": "html"})
Expand All @@ -616,7 +680,12 @@ def apply_archive_headers(extension):
return sample_class

# Content heuristics
partial = content[:2048] + content[-2048:]
if len(content) >= 4096:
# take only the first and last 2048 bytes from the content
partial = content[:2048] + content[-2048:]
else:
# take the whole content
partial = content

# Dumped PE file heuristics (PE not recognized by libmagic)
if b".text" in partial and b"This program cannot be run" in partial:
Expand All @@ -639,12 +708,70 @@ def apply_archive_headers(extension):
)
return sample_class

# Telegram
if partial.startswith(b"TDF$"):
sample_class.update(
{
"kind": "telegram_desktop_file",
nazywam marked this conversation as resolved.
Show resolved Hide resolved
}
)
return sample_class

if partial.startswith(b"TDEF"):
sample_class.update(
{
"kind": "telegram_desktop_encrypted_file",
nazywam marked this conversation as resolved.
Show resolved Hide resolved
}
)
return sample_class

#
# Detection of text files: as these files could also be scripts, do not
# return sample_class immediately after a successful detection. This way
# the script heuristics further below can still override the detection.
#
nazywam marked this conversation as resolved.
Show resolved Hide resolved

# magic samples of ASCII files:
# XML 1.0 document, ASCII text
# XML 1.0 document, ASCII text, with very long lines (581), with CRLF line terminators
nazywam marked this conversation as resolved.
Show resolved Hide resolved
# Non-ISO extended-ASCII text, with no line terminators
# troff or preprocessor input, ASCII text, with CRLF line terminators
if "ASCII" in magic:
sample_class.update(
{
"kind": "ascii",
}
)

if magic.startswith("CSV text"):
sample_class.update(
{
"kind": "csv",
}
)

if magic.startswith("ISO-8859"):
sample_class.update(
{
"kind": "iso-8859-1",
}
)

# magic samples of UTF-8 files:
# Unicode text, UTF-8 text, with CRLF line terminators
# XML 1.0 document, Unicode text, UTF-8 text
if "UTF-8" in magic:
sample_class.update(
{
"kind": "utf-8",
}
)

# Heuristics for scripts
try:
partial_str = partial.decode(chardet.detect(partial)["encoding"]).lower()
except Exception:
self.log.warning("Heuristics disabled - unknown encoding")
partial_str = None

if partial_str:
vbs_keywords = [
Expand Down Expand Up @@ -710,59 +837,8 @@ def apply_archive_headers(extension):
)
return sample_class

# magic of XML files: XML 1.0 document, ASCII text
if magic.startswith("ASCII") or magic.endswith("ASCII text"):
sample_class.update(
{
"kind": "ascii",
}
)
return sample_class
if magic.startswith("CSV text"):
sample_class.update(
{
"kind": "csv",
}
)
return sample_class
if magic.startswith("ISO-8859"):
sample_class.update(
{
"kind": "iso-8859-1",
}
)
return sample_class
if magic.startswith("UTF-8"):
sample_class.update(
{
"kind": "utf-8",
}
)
return sample_class
if magic.startswith("PGP"):
sample_class.update(
{
"kind": "pgp",
}
)
return sample_class
if magic.startswith(("pcap capture file", "tcpdump capture file")):
sample_class.update(
{
"kind": "pcap",
}
)
return sample_class
if magic.startswith("pcap") and "ng capture file" in magic:
sample_class.update(
{
"kind": "pcapng",
}
)
return sample_class

# If not recognized then unsupported
return None
return sample_class

def _classify_yara(self, task: Task) -> List[Dict[str, Optional[str]]]:
sample = task.get_resource("sample")
Expand Down
88 changes: 88 additions & 0 deletions tests/test_classifier_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,91 @@ def test_process_misc_png(self):
},
)
self.assertTasksEqual(res, [expected])

def test_misc_utf8(self):
    """Classify the ``misc.utf-8`` fixture and expect kind ``utf-8``."""
    sample = mock_resource("misc.utf-8")
    expected_magic = self.magic_from_content(sample.content, mime=False)
    produced = self.run_task(mock_task(sample))

    # Headers and payload the classifier is expected to emit for this sample.
    headers = {
        "type": "sample",
        "stage": "recognized",
        "origin": "karton.classifier",
        "quality": "high",
        "kind": "utf-8",
        "mime": "text/plain",
    }
    payload = {
        "sample": sample,
        "tags": ["misc:utf-8"],
        "magic": expected_magic,
    }

    self.assertTasksEqual(produced, [Task(headers=headers, payload=payload)])

def test_misc_pcapng(self):
    """Classify the ``misc.pcapng`` fixture and expect kind ``pcapng``."""
    sample = mock_resource("misc.pcapng")
    expected_magic = self.magic_from_content(sample.content, mime=False)
    produced = self.run_task(mock_task(sample))

    # Headers and payload the classifier is expected to emit for this sample.
    headers = {
        "type": "sample",
        "stage": "recognized",
        "origin": "karton.classifier",
        "quality": "high",
        "kind": "pcapng",
        "mime": "application/octet-stream",
    }
    payload = {
        "sample": sample,
        "tags": ["misc:pcapng"],
        "magic": expected_magic,
    }

    self.assertTasksEqual(produced, [Task(headers=headers, payload=payload)])

def test_misc_pcap(self):
    """Classify the ``misc.pcap`` fixture and expect kind ``pcap``."""
    sample = mock_resource("misc.pcap")
    expected_magic = self.magic_from_content(sample.content, mime=False)
    produced = self.run_task(mock_task(sample))

    # Headers and payload the classifier is expected to emit for this sample.
    headers = {
        "type": "sample",
        "stage": "recognized",
        "origin": "karton.classifier",
        "quality": "high",
        "kind": "pcap",
        "mime": "application/vnd.tcpdump.pcap",
    }
    payload = {
        "sample": sample,
        "tags": ["misc:pcap"],
        "magic": expected_magic,
    }

    self.assertTasksEqual(produced, [Task(headers=headers, payload=payload)])

def test_misc_pgp(self):
    """Classify the ``misc.pgp`` fixture and expect kind ``pgp``."""
    sample = mock_resource("misc.pgp")
    expected_magic = self.magic_from_content(sample.content, mime=False)
    produced = self.run_task(mock_task(sample))

    # Headers and payload the classifier is expected to emit for this sample.
    headers = {
        "type": "sample",
        "stage": "recognized",
        "origin": "karton.classifier",
        "quality": "high",
        "kind": "pgp",
        "mime": "application/octet-stream",
    }
    payload = {
        "sample": sample,
        "tags": ["misc:pgp"],
        "magic": expected_magic,
    }

    self.assertTasksEqual(produced, [Task(headers=headers, payload=payload)])
Binary file modified tests/testdata/document.doc
Binary file not shown.
Binary file added tests/testdata/misc.pcap
Binary file not shown.
Binary file added tests/testdata/misc.pcapng
Binary file not shown.
Binary file added tests/testdata/misc.pgp
Binary file not shown.
2 changes: 2 additions & 0 deletions tests/testdata/misc.utf-8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
some special characters:
姓名
Loading