Skip to content

Commit

Permalink
refactor: Simplifies project structure.
Browse files Browse the repository at this point in the history
clava's structure has grown into quite a mess, mostly because most parts
were created while researching this project. This commit attempts to
restructure major parts of clava without changing much of the
functionality. I've removed some of the features that I have not used since
early development, but they might re-appear in a more generalized form.

The next step will be the migration of the logic buried in the Jupyter
notebooks.
  • Loading branch information
strfx committed Sep 30, 2021
1 parent 3c21c82 commit aa20b28
Show file tree
Hide file tree
Showing 20 changed files with 570 additions and 602 deletions.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
* [Getting Started](#getting-started)
* [Development](#development)
* [Contribute](#contribute)
* [Resources](#resources)
* [Credits](#credits)

# About
Expand Down Expand Up @@ -84,6 +85,20 @@ Alternatively, you can run pytest against the `tests/` directory with your own s

Contributions are welcome! If you plan major changes, please create an issue first to discuss the changes.

# Resources

Good datasets are essential; however, there are not many public datasets of goodware and malware executables. You can assemble your own dataset using projects like:

* [VirusShare](https://virusshare.com/) offers access to large amounts of malware (registration required).
* [MalwareBazaar](https://bazaar.abuse.ch/) offers daily collections of malware: https://mb-api.abuse.ch/downloads/
* [APTMalware GitHub Repo](https://github.com/cyber-research/APTMalware)
* [Sysinternals Tools](https://learn.microsoft.com/en-us/sysinternals/) are a great set to test your rules against, since Sysinternals tools often cause false-positive hits.

Public goodware datasets are rare - PRs are welcome :smile:

Tools:
* [Capstone.js](https://alexaltea.github.io/capstone.js/) for interactive disassembling, useful during development.

# Credits

clava was heavily inspired by these projects:
Expand All @@ -93,3 +108,6 @@ clava was heavily inspired by these projects:
* [binsequencer](https://github.com/karttoon/binsequencer/)
* [yabin](https://github.com/AlienVault-OTX/yabin)

I would also like to thank these projects:
* [pefile](https://pypi.org/project/pefile/)
* [Capstone Disassembler](https://www.capstone-engine.org/)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@
extras_require={
'dev': [
'pytest',
'pandas',
'mypy'
'mypy',
'pandas', # TODO: Remove pandas dep
]
}
)
2 changes: 1 addition & 1 deletion src/clava/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = 1.0
__version__ = "1.0.0"
147 changes: 29 additions & 118 deletions src/clava/cli.py
Original file line number Diff line number Diff line change
@@ -1,167 +1,78 @@
"""clava v1.0
Generate Code-Based Yara Rules.
"""clava v1.0: Generate code-based Yara rules.
Usage:
clava_cli.py signature <file> [--n=<sequences>] [--strategy=<strategy>]
clava_cli.py yara <file> [--n=<sequences>] [--strategy=<strategy>] [--output=<output file>]
clava_cli.py dump <file>
clava_cli.py <file> [--topk=<sequences>] [--output=<output file>]
Options:
-h --help Show this message.
--version Show version of clava.
--n=<sequences> Size of the sequences [default: 3].
--strategy=<strategy> Signature generator to use [default: logreg-tf].
--topk=<sequences> Number of instruction sequences to use [default: 3]
--output=<output file> Store the generated rule in this file.
"""
import sys
import warnings
from operator import attrgetter
from pathlib import Path
from typing import Dict, Literal

from docopt import docopt
from pefile import PEFormatError

from clava.crafting import to_yara
from clava.generators.logregtf import LogRegTF
from clava.io import disassemble, disassembleX
from clava.models import Signature
from clava.disassembling import disassemble
from clava.inference import LogRegClassifier
from clava.output import generate_yara_rule

# Suppress sklearn warnings for now (version mismatch)
warnings.filterwarnings("ignore", category=UserWarning)
NGRAM_SIZE = 6
CLASSIFIER_PATH = Path('wipro/models/simple-tf-logreg/logregtf.joblib')
VECTORIZER_PATH = Path('wipro/models/simple-tf-logreg/tfvectorizer.joblib')


def abort(msg):
    """
    Report an error on stderr and terminate the process.

    The message is written as ``ERROR: <msg>`` followed by a newline, after
    which the interpreter exits with status code 1.
    """
    print(f"ERROR: {msg!s}", file=sys.stderr)
    raise SystemExit(1)


def input_file(filepath: str):
    """
    Turn a path string from the command line into a ``Path``.

    If nothing exists at that location, ``abort`` is called with a
    "No such file or directory" message naming the final path component.
    """
    candidate = Path(filepath)

    if candidate.exists():
        return candidate

    abort("No such file or directory: " + candidate.name)
    return candidate


def generate_signature(
arguments: Dict,
signature_format: Literal['plain', 'yara']
):
def generate_signature(arguments):
"""
Generate a signature for the provided binary.
Currently, clava supports two signature formats:
* plain: Returns the internal representation of a signature, used for
debugging.
Generate Yara rule for a sample.
* yara: Returns a yara rule, which can be directly used with
the official yara binaries (or yarac for compilation)
Unless you are working on new models, you'll want 'yara'.
The generated yara rule can be directly used with the official yara binaries
(or yarac for compilation).
NOTE: At the moment, we only support signatures for Portable Executable
(PE) files. Other executable formats such as ELF are not supported,
but could be added easily.
"""
# Binary to create signature for
input_sample = input_file(arguments['<file>'])
sample = Path(arguments['<file>'])

# Store the resulting rule in this file (or write to stdout)
output_file = arguments.get('--output', None)
# How many sequences (i.e., opcode n-grams) to include in the signature
num_sequences_in_signatures = int(arguments.get('--n', 3))

print(f"[*] Creating signature for {input_sample.name} "
f"(with: n={num_sequences_in_signatures})")
# How many sequences (i.e., opcode n-grams) to include in the signature
num_sequences_in_signatures = int(arguments.get('--topk', 3))

try:
sample = disassembleX(Path(input_sample))
mode, disassembly = disassemble(sample)
except PEFormatError:
abort(f"Sample '{input_sample.name}' is not a valid PE file.")
abort(f"Sample '{sample.name}' is not a valid PE file.")
except ValueError:
abort(
f"Unable to locate code section in sample: '{input_sample.name}',"
f"Unable to locate code section in sample: '{sample.name}',"
"might be obfuscated, packed or encrypted."
)

# TODO: Actually load the correct strategy from CLI arguments.
strategy = LogRegTF(
'wipro/models/simple-tf-logreg/logregtf.joblib',
'wipro/models/simple-tf-logreg/tfvectorizer.joblib',
num_sequences_in_signatures
)

# `generate` ranks all opcode sequences by their maliciousness and returns
# the top-k sequences, which we'll put 1:1 in a signature.
sequences = strategy.generate(sample, num_sequences_in_signatures)

if signature_format == "yara":
# Format signature as a Yara rule
yara_rule_str = to_yara(
"clava_" + sample.filename,
sequences,
dict(architecture=sample.architecture)
)
model = LogRegClassifier(CLASSIFIER_PATH, VECTORIZER_PATH, ngram_size=6)

# Always display the generated rule
print(yara_rule_str)

if output_file:
with open(output_file, 'w') as fd:
fd.write(yara_rule_str)

print("[*] Yara Rule written to " + output_file)
elif signature_format == "plain":
# The plain format is used during development and probably does not make
# much sense to anyone else. This format just outputs the sequence of
# mnemonics in the selected sequences, e.g.,
#
# Signature(
# sequences=[
# 'shl xor xor xor movabs and',
# 'shl lea movsxd shl lea test',
# 'lea shl xor xor xor movabs'
# ])
topk_mnemonics = []
for seq in sequences:
topk_mnemonics.append(" ".join(e.mnemonic for e in seq[1]))

signature = Signature(topk_mnemonics)
print("[!] Got: ", signature)


def dump(arguments):
"""
Dump the instructions of the passed binary.
"""
binary = input_file(arguments['<file>'])
sequences = model.rank(disassembly, topk=num_sequences_in_signatures)

try:
_, instructions = disassemble(
binary,
decode=attrgetter('mnemonic')
)
except PEFormatError:
abort(binary.name + " does not appear to be a valid PE file.")
yara_rule_str = generate_yara_rule("clava_" + sample.name, sequences, mode)

print(yara_rule_str)

print(f"{binary}: ({','.join(instructions)})")
if output_file:
Path(output_file).write_text(yara_rule_str)


def main():
arguments = docopt(__doc__, version="clava v1.0")

if arguments['signature']:
generate_signature(arguments, "plain")

if arguments['yara']:
generate_signature(arguments, "yara")

elif arguments['dump']:
dump(arguments)


if __name__ == "__main__":
main()
generate_signature(arguments)
64 changes: 0 additions & 64 deletions src/clava/corpus.py

This file was deleted.

Loading

0 comments on commit aa20b28

Please sign in to comment.