chore: Cleaning up.

* Adds yara instructions to the README.
* Removes the trailing empty line at the end of various docstrings.
* Removes BaselineClassifier for now; it requires some more work to migrate to clava.
strfx · Sep 30, 2021 · 5a81852 · 5a81852
1 parent f5e82bb
commit 5a81852
Show file tree

Hide file tree

Showing 9 changed files with 126 additions and 193 deletions.
diff --git a/README.md b/README.md
@@ -54,13 +54,26 @@ clava offers a simple CLI to interact. To list all available options, run:
 ```sh
 $ clava -h
 ```
-
 To generate a yara rule based on a sample:
 
 ```sh
 $ clava yara <path/to/sample>
 ```
 
+Use the official Yara binaries to apply the generated rule to your sample
+and / or corpus of samples. The binaries can be downloaded from the [Yara project page](https://virustotal.github.io/yara/).
+
+For example:
+```sh
+# Generate a rule and store it in a file
+$ clava yara <path/to/sample> -o myrule.yar
+# Apply the yara rule on a directory
+$ yara myrule.yar my-malware-corpus/
+# Compile the yara rule for better performance
+$ yarac myrule.yar myrule-compiled
+$ yara -C myrule-compiled my-malware-corpus/
+```
+
 **Important:** Rules created with clava should **not** directly be used in production, but can assist during rule development. This project is heavily inspired by [yarGen](https://github.com/Neo23x0/yarGen), therefore see also Florian Roth's [blog post](https://cyb3rops.medium.com/how-to-post-process-yara-rules-generated-by-yargen-121d29322282) *"How to post-process YARA rules generated by yarGen"*.
 
 # Development

diff --git a/src/clava/cli.py b/src/clava/cli.py
@@ -1,12 +1,12 @@
 """clava v1.0: Generate code-based Yara rules.
 
 Usage:
-    clava_cli.py <file> [--topk=<sequences>] [--output=<output file>]
+    clava yara <file> [--topk=<sequences>] [--output=<output file>]
 
 Options:
     -h --help               Show this message.
     --version               Show version of clava.
-    --topk=<sequences>      Number of instruction sequences to use [default: 3]
+    --topk=<sequences>      Number of instruction sequences to use [default: 3].
     --output=<output file>  Store the generated rule in this file.
 
 """
@@ -20,7 +20,7 @@
 from clava.inference import LogRegClassifier
 from clava.output import generate_yara_rule
 
-NGRAM_SIZE = 6
+# TODO: Make paths configurable via CLI.
 CLASSIFIER_PATH = Path('wipro/models/simple-tf-logreg/logregtf.joblib')
 VECTORIZER_PATH = Path('wipro/models/simple-tf-logreg/tfvectorizer.joblib')
 
@@ -32,23 +32,16 @@ def abort(msg):
 
 def generate_signature(arguments):
     """
-    Generate Yara rule for a sample.
-
-    The generated yara rule can be directly used with the official yara binaries
-    (or yarac for compilation).
-
-    NOTE: At the moment, we only support signatures for Portable Executables
-          (PE) files. Other exectuable formats such as ELF are not supported,
-          but could be added easily.
-
+    Generate a Yara rule for given (malware) sample.
     """
     # Binary to create signature for
     sample = Path(arguments['<file>'])
 
-    # Store the resulting rule in this file (or write to stdout)
+    # Store the generated rule in this file if provided; either way, the rule
+    # will always be written to stdout too.
     output_file = arguments.get('--output', None)
 
-    # How many sequences (i.e., opcode n-grams) to include in the signature
+    # How many sequences (i.e., mnemonic n-grams) to include in the signature
     num_sequences_in_signatures = int(arguments.get('--topk', 3))
 
     try:
@@ -75,4 +68,5 @@ def generate_signature(arguments):
 
 def main():
     arguments = docopt(__doc__, version="clava v1.0")
-    generate_signature(arguments)
+    if arguments['yara']:
+        generate_signature(arguments)
diff --git a/src/clava/disassembling.py b/src/clava/disassembling.py
@@ -1,15 +1,15 @@
 """
-Disassemble input binaries i.e., legitimate and malicious executables.
+Disassemble binaries, i.e., legitimate or malicious executables.
 
 Clava uses the Capstone Engine to disassemble binaries. Keep in mind that
 disassembling is a hard and error-prone process and there are many pitfalls.
 We've kept the disassembling straightforward and ignored many edge cases.
 It worked well enough in our experiments, hence we did not pursue any
 improvements here.
 
-Also, clava only supports programs written for x86 architecture and
-in the Portable Executable (PE) format.
-
+Also, clava only supports programs written for x86 architecture and in the
+Portable Executable (PE) format. Other executable formats such as ELF are not
+supported, but could be added easily.
 """
 from pathlib import Path
 from typing import List, Tuple
@@ -20,24 +20,24 @@
 
 class NoCodeSectionError(Exception):
     """
-    Raise NoCodeSection when no code section can be parsed from a binary.
-
-    There are usually two cases where we can't decode the code section:
-      * The binary is corrupt or malformatted
-      * The binary is obfuscated, packed or encrypted, where the code will
-        be gradually unpacked / decrypted upon execution.
+    Raise NoCodeSectionError if we fail to decode an executable's code section.
 
-    Since we currently do not support generating signatures for packed /
-    encrypted binaries, we skip these samples.
+    The most common cases where this can happen are:
+      * The executable is corrupt or malformed
+      * The executable is obfuscated, packed or encrypted, i.e., the
+        executable's code will be unpacked / decrypted upon execution.
 
+    Clava does not explicitly support generating signatures for executables
+    that are packed, encrypted or obfuscated in any other way.
     """
     pass
 
 
 # An instruction consists of two components: an operation and operands e.g.,
-# 'MOV EBP, ESP' where 'MOV' represents the operation EBP and ESP are the
-# operands. We also keep the raw bytes, since we generate code-based signatures
-# later which match on the actual bytes, not on disassembled instructions.
+# 'MOV EBP, ESP' where 'MOV' represents the operation, and 'EBP' and 'ESP' are
+# the operands. We also keep the raw bytes, since we generate code-based
+# signatures later which must match on the actual bytes, not the disassembled
+# instructions.
 #
 # Example: ("MOV", "EBP, ESP", b"\x89\xE5")
 DisassembledInstruction = Tuple[str, str, bytearray]
@@ -48,7 +48,7 @@ def disassemble(executable_location: Path) -> Tuple[int, List[DisassembledInstru
     Disassemble an executable.
 
     To disassemble an executable, we locate the program's code section,
-    extract the raw bytes and then decode the instructions from these bytes.
+    extract the raw bytes and then decode the instructions.
 
     Args:
         executable_location: Location of the executable to be disassembled. This
@@ -66,7 +66,6 @@ def disassemble(executable_location: Path) -> Tuple[int, List[DisassembledInstru
         NoCodeSectionError: Raised when the binary has no code section, or
             if the section could not be decoded (e.g., binary is obfuscated or
             encrypted). 
-
     """
     pe = pefile.PE(str(executable_location))
 

diff --git a/src/clava/inference.py b/src/clava/inference.py
@@ -9,6 +9,7 @@
 program that are most likely to appear in malware rather than goodware) and
 generate the signature.
 
+TODO: Update this documentation as it does not apply to all classifiers anymore.
 """
 import random
 from pathlib import Path
@@ -23,8 +24,6 @@
 # RankedInstructions holds the instruction sequence along with the
 # class probabilities for that sequence being more likely found in
 # malware or legitimate software.
-#
-# TODO: This could be further simplified.
 RankedInstructions = Tuple[
     # First tuple holds the class probabilities for the instruction sequence,
     # where first item is p(malicious) and second is p(legitimate),
@@ -45,33 +44,32 @@ def rank(self, instructions: List[DisassembledInstruction], topk: int) -> List[R
         Rank a list of instruction sequences by their maliciousness.
 
         Args:
-            instructions: A list of instructions, i.e., usually a disassembled
-                program, but rank() accepts any sequence of instructions.
+            instructions: 
+                A list of instructions, usually a disassembled program, but
+                rank() accepts any sequence of instructions.
 
             topk: Only keep the k most "malicious" sequences.
 
         Returns:
-            A ranked list of RankedInstructions, sorted by their maliciousness,
-            i.e., the probability of how likely this instruction appears in
+            A list of RankedInstructions sorted by their maliciousness, i.e.,
+            the probability of how likely this instruction sequence appears in
             malware. Sorted in descending order, meaning most malicious first.
 
-            Classifier must always return RankedInstruction with the *complete*
-            instruction (DisassembledInstruction). Must kept in mind since
-            a classifier might only use a subset of the features.
-
+            Classifier must always return RankedInstructions with the *complete*
+            instruction (DisassembledInstruction). Keep this in mind, since a
+            classifier might only use a subset of the features.
         """
         pass
 
 
 class LogRegClassifier(Classifier):
     """
-    LogRegClassifier uses the logistic regression model.
+    Ranks instruction sequences with the fitted logistic regression model.
 
     This classifier implements the main work of clava as of now, the full
     procedure is described in the paper. TL;DR: We fitted a logistic regression
     on the Term Frequency weights of mnemonics based on their appearances in
     malicious and legitimate software.
-
     """
 
     def __init__(self, classifier_path: Path, vectorizer_path: Path, ngram_size=6):
@@ -80,35 +78,31 @@ def __init__(self, classifier_path: Path, vectorizer_path: Path, ngram_size=6):
         self.ngram_size = ngram_size
 
     def rank(self, instructions: List[DisassembledInstruction], topk: int) -> List[RankedInstructions]:
-        # Build ngrams:
-        # Generate n-grams of the disassembly to generate more candidates,
-        # then classify the n-grams.
-        disassembly_ngrams = list(nltk.ngrams(instructions, self.ngram_size))
-
-        # The model uses ngrams of mnemonics, e.g.
-        #   [('add', 'push', 'xor'), ('push', 'xor', 'and'), ...]
-        # But to generate the rule, we require the full raw bytes.
-        # Therefore, we create n-grams on the full binary, and then
-        # extract the mnemonics from them. Then we can re-combine them later.
-        # We use a generator expression for that
-        # documents = ["add push xor", "push xor and"]
-        documents = (
+        # Using n-grams, we can multiply the number of potential signature
+        # components.
+        instruction_ngrams = list(nltk.ngrams(instructions, self.ngram_size))
+
+        # The model was only trained on a program's mnemonics like 'push', 'xor',
+        # etc. Therefore, we extract the mnemonics in a second list, classify
+        # them and combine them with the complete instructions later.
+        #
+        # Example: list(mnemonic_ngrams) -> ["add push xor", "push xor and"]
+        mnemonic_ngrams = (
             " ".join(instr[0] for instr in ngram)
-            for ngram in disassembly_ngrams
+            for ngram in instruction_ngrams
         )
 
-        # Transform opcode sequence into TF vector
-        transformed = self.vectorizer.transform(documents)
+        # Transform the mnemonic sequences into term-frequency vectors.
+        transformed = self.vectorizer.transform(mnemonic_ngrams)
 
-        # Classify transformed vectors
-        # predict_proba returns the class probabilites for each sample
-        # a list of lists, e.g.
-        # [[0.4, 0.6], [0.7, 0.3], ...]
+        # Classify the transformed vectors using the fitted model. predict_proba
+        # returns one row of class probabilities per sample, e.g.,
+        # [[0.4, 0.6], [0.7, 0.3], ...]
         # Therefore, we need to re-join them with the disassembly ngrams.
         probabilities = self.classifier.predict_proba(transformed)
 
-        # Combine classification results with ngrams
-        # sequences = [(scores, ngrams)]
+        # Combine the class probabilities with the original n-grams.
+        # Example: list(sequences) ->
         # [
         #   (
         #       (array([0.49666687, 0.50333313]),
@@ -119,90 +113,19 @@ def rank(self, instructions: List[DisassembledInstruction], topk: int) -> List[R
         #      )
         #   )
         # ]
-        sequences = list(zip(probabilities, disassembly_ngrams))
+        sequences = list(zip(probabilities, instruction_ngrams))
 
         return sort_by_maliciousness(sequences)[:topk]
 
 
-class BaselineClassifier(Classifier):
-    """
-    Implements the static baseline approach.
-
-    The baseline approach is documented in the paper. TL;DR: It is simply
-    data-mining a large number of legitimate software samples to remove all
-    possible instruction sequences that appear in the goodware corpus. This
-    approach guarantees that within that corpus (and only within that) there are
-    no false positive matches (i.e., the instruction sequence matches a goodware
-    sample).  However, this approach is computationally very, very expensive.
-
-    """
-
-    def __init__(self, corpus_benign):
-        self.corpus_benign = corpus_benign
-
-    def rank(self, instruction_sequences, topk):
-        # p
-        # Split sample into sequences of length n (non overlapping)
-        # unique since we do not care about duplicate sequences
-        sequences = unique_chunks(instruction_sequences, n=6)
-
-        mnemonics_only = (
-            " ".join(instr[0] for instr in ngram)
-            for ngram in sequences
-        )
-
-        # Track sequences that also appear in goodware.
-        appear_in_goodware = set()
-
-        # Manually check each sequence against each goodware sample
-        # to filter out sequences that also appear in goodware.
-        for goodware in self.corpus_benign.samples:
-            for sequence in mnemonics_only:
-                if " ".join(sequence) in goodware.code:
-                    appear_in_goodware.add(sequence)
-
-        candidates = mnemonics_only - appear_in_goodware
-        if len(candidates) < 1:
-            return None
-
-        ranked = []
-
-        # Randomly choose candidates for signature
-        candidates = random.choices(tuple(candidates), k=topk)
-        for i in range(topk):
-            ranked.append(
-                (1, 0),
-                candidates[i]
-            )
-
-        return ranked
-
-        # Form signature string from individual sets
-        # candidates = [" ".join(candidate) for candidate in candidates]
-
-        # p(malicious) = 1, p(benign) = 0
-        # Comply with the return format
-
-        # p_malicious, p_legit = 1, 0
-
-        # x = [
-        #     (p_malicious, p_legit), (None, None, )
-        # ]
-
-        # return candidates
-
-
 class DummyClassifier(Classifier):
     """
-    Dummy to demonstrate how to implement a (simple) classifier.
+    Ranks instruction sequences randomly.
 
-    Mostly used for testing purposes, but can also act as a baseline.
+    Used for testing and to show how to implement a simple classifier.
     """
 
     def rank(self, instructions, topk) -> List[RankedInstructions]:
-        """
-        Randomly rank instruction sequences.
-        """
         ranked = []
 
         for i in range(topk):
@@ -217,7 +140,5 @@ def rank(self, instructions, topk) -> List[RankedInstructions]:
 
 
 def sort_by_maliciousness(ranked_not_sorted: List[RankedInstructions]) -> List[RankedInstructions]:
-    """
-    Sort instruction sequences by their "maliciousness"
-    """
+    """ Sort instruction sequences by their maliciousness. """
     return sorted(ranked_not_sorted, key=lambda seq: seq[0][0], reverse=True)
diff --git a/src/clava/output.py b/src/clava/output.py
@@ -8,11 +8,7 @@
 
 
 def generate_yara_rule(name: str, sequences, mode: int) -> str:
-    """
-    Returns signature formatted as a Yara rule.
-
-    Returns Yara rule as string.
-    """
+    """ Returns signature formatted as a Yara rule. """
 
     # Metadata strings must always be wrapped in quotes.
     metadata = {