# Copyright 2023 Quarkslab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generic features
"""
import re
import random
from typing import Any, no_type_check
from re import Pattern
from qbindiff.features.extractor import (
FeatureCollector,
FunctionFeatureExtractor,
InstructionFeatureExtractor,
OperandFeatureExtractor,
)
from qbindiff.loader.types import DataType, ReferenceType
from qbindiff.loader import (
Program,
Function,
Instruction,
Operand,
Data,
Structure,
StructureMember,
)
[docs]
class Address(FunctionFeatureExtractor):
"""
Address of the function as a feature
"""
key = "addr"
[docs]
def visit_function(self, _: Program, function: Function, collector: FeatureCollector) -> None:
value = function.addr
collector.add_feature(self.key, value)
[docs]
class DatName(InstructionFeatureExtractor):
"""
References to data in the instruction (as retrieved by the backend loader).
This feature maps the data value to the number of reference occurences to it.
It's a superset of :py:obj:`StrRef` feature.
"""
key = "dat"
help_msg = """
References to data in the instruction (as retrieved by the backend loader).
This feature maps the data value to the number of reference occurences to it.
It's a superset of StrRef (strref) feature.
""".strip()
# MyPy still does not support type inference on structural pattern matching statement
[docs]
@no_type_check
def visit_instruction(
self, _: Program, instruction: Instruction, collector: FeatureCollector
) -> None:
for ref_type, references in instruction.references.items():
for reference in references:
match (ref_type, reference):
case (ReferenceType.DATA, Data):
if reference.type != DataType.UNKNOWN and reference.value is not None:
collector.add_dict_feature(self.key, {reference.value: 1})
case (ReferenceType.STRUC, Structure):
collector.add_dict_feature(self.key, {reference.name: 1})
case (ReferenceType.STRUC, StructureMember):
collector.add_dict_feature(
self.key, {reference.structure.name + "." + reference.name: 1}
)
case (ReferenceType.ENUM, _):
logging.warning("Unhandled case of enum reference in DatName feature")
pass
case _:
assert False, f"Malformed reference. {reference=} {ref_type=}"
[docs]
class StrRef(InstructionFeatureExtractor):
"""
References to strings in the instruction.
This feature maps the string to the number of occurences to it.
"""
key = "strref"
[docs]
def visit_instruction(
self, _: Program, instruction: Instruction, collector: FeatureCollector
) -> None:
for data in instruction.data_references:
if data.type == DataType.ASCII:
collector.add_dict_feature(self.key, {data.value: 1})
[docs]
class Constant(OperandFeatureExtractor):
"""
Numeric constant (32/64bits) in the instruction (not addresses).
This maps numerical values to the number of occurences to it.
It excludes the addresses (relies on IDA to discriminate them).
"""
key = "cst"
[docs]
def visit_operand(self, _: Program, operand: Operand, collector: FeatureCollector) -> None:
if operand.is_immediate():
collector.add_dict_feature(self.key, {str(operand.value): 1}) # This should be a string
[docs]
class FuncName(FunctionFeatureExtractor):
"""
Match the function names.
Optionally the constructor takes a regular expression pattern to exclude function names
"""
key = "fname"
def __init__(self, *args: Any, excluded_regex: Pattern[str] | None = None, **kwargs: Any):
"""
:param args: parameters of a feature extractor
:param excluded_regex: regex to apply in order to exclude names
:param kwargs: keyworded arguments
"""
super(FuncName, self).__init__(*args, **kwargs)
self._excluded_regex = excluded_regex
[docs]
def is_excluded(self, function: Function) -> bool:
"""
Returns if the function should be excluded (and not considered) based on an optional regex
:param function: function to consider
:return: bool
"""
if self._excluded_regex is None:
return bool(re.match(rf"^(sub|fun)_0*{function.addr:x}$", function.name, re.IGNORECASE))
else:
return bool(self._excluded_regex.match(function.name))
[docs]
def visit_function(self, _: Program, function: Function, collector: FeatureCollector) -> None:
if self.is_excluded(function):
# We cannot properly exclude the name since a zero feature vector will
# have a distance of zero (hence similarity of 1) with any other zero
# feature vector. Hence, add a good enough random number to reduce the
# chance of a collision
collector.add_dict_feature(
self.key, {function.name + str(random.randrange(1000000000)): 1}
)
else:
collector.add_dict_feature(self.key, {function.name: 1})