Source code for qbindiff.loader.backend.binexport

# Copyright 2023 Quarkslab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""BinExport backend loader
"""

# builtin imports
from __future__ import annotations
import logging
import weakref
from typing import Any, TypeAlias, TYPE_CHECKING
from collections.abc import Iterator
from functools import cached_property

# third-party imports
import capstone  # type: ignore[import-untyped]
import binexport  # type: ignore[import-untyped]
import binexport.types  # type: ignore[import-untyped]
import networkx

# local imports
from qbindiff.loader.backend import (
    AbstractProgramBackend,
    AbstractFunctionBackend,
    AbstractBasicBlockBackend,
    AbstractInstructionBackend,
    AbstractOperandBackend,
)
from qbindiff.loader.backend.utils import convert_operand_type
from qbindiff.loader import Structure
from qbindiff.utils import log_once
from qbindiff.loader.types import (
    FunctionType,
    ReferenceType,
    ReferenceTarget,
    OperandType,
    InstructionGroup,
    ProgramCapability,
)

if TYPE_CHECKING:
    from qbindiff.types import Addr

# Type aliases
beFunction: TypeAlias = binexport.function.FunctionBinExport
beBasicBlock: TypeAlias = binexport.basic_block.BasicBlockBinExport
capstoneOperand: TypeAlias = Any  # Relaxed typing


# === General purpose utils functions ===
def _get_capstone_disassembler(binexport_arch: str, mode: int = 0):
    def capstone_context(arch, mode):
        context = capstone.Cs(arch, mode)
        context.detail = True
        return context

    if binexport_arch == "x86-32":
        return capstone_context(capstone.CS_ARCH_X86, capstone.CS_MODE_32 | mode)
    elif binexport_arch == "x86-64":
        return capstone_context(capstone.CS_ARCH_X86, capstone.CS_MODE_64 | mode)
    elif binexport_arch == "ARM-32":
        return capstone_context(capstone.CS_ARCH_ARM, mode)
    elif binexport_arch == "ARM-64":
        return capstone_context(capstone.CS_ARCH_ARM64, mode)
    elif binexport_arch == "MIPS-32":
        return capstone_context(capstone.CS_ARCH_MIPS, capstone.CS_MODE_32 | mode)
    elif binexport_arch == "MIPS-64":
        return capstone_context(capstone.CS_ARCH_MIPS, capstone.CS_MODE_64 | mode)

    raise NotImplementedError(f"Architecture {binexport_arch} has not be implemented")


def is_same_mnemonic(mnemonic1: str, mnemonic2: str) -> bool:
    """
    Check whether two mnemonics are the same.

    Both mnemonics are first put in a canonical form ("ldmia" becomes "ldm" and every
    "lo" substring becomes "cc"). They are then considered the same if they are equal
    or if the longest one is the shortest one followed by ".w".

    :param mnemonic1: first mnemonic
    :param mnemonic2: second mnemonic
    :return: True if the two mnemonics are considered equivalent
    """

    def canonical(name: str) -> str:
        return "ldm" if name == "ldmia" else name.replace("lo", "cc")

    first, second = canonical(mnemonic1), canonical(mnemonic2)

    # Order the two forms by length so that the ".w" suffix is tested only once
    if len(first) <= len(second):
        shorter, longer = first, second
    else:
        shorter, longer = second, first

    return shorter == longer or shorter + ".w" == longer


def parse_architecture_flag(arch_mode_str: str) -> capstone.Cs | None:
    """
    Return the capstone context corresponding to the string passed as parameter.
    The format is ``CS_ARCH_<arch>:CS_MODE_<mode>[,CS_MODE_<mode>...]``, for example
    ``CS_ARCH_X86:CS_MODE_64``. Spaces are ignored wherever they appear.

    :param arch_mode_str: textual description of the capstone architecture and mode(s)
    :return: a capstone context with the details enabled, or None if the string is
             malformed or does not name valid capstone architecture/mode constants
    """

    def lookup(name: str, prefix: str) -> int | None:
        """Resolve an integer capstone constant by name, enforcing its prefix"""
        # Without the prefix check any attribute of the capstone module would be
        # accepted, e.g. a mode constant silently used as architecture
        if not name.startswith(prefix):
            return None
        value = getattr(capstone, name, None)
        return value if isinstance(value, int) else None

    # Remove every space (not only the trailing ones)
    parts = arch_mode_str.replace(" ", "").split(":")
    # Check for only one separator
    if len(parts) != 2:
        return None

    # Unpack arch and mode
    arch_str, mode_str = parts
    arch = lookup(arch_str, "CS_ARCH_")
    if arch is None:
        return None

    mode = 0
    # Look for one or more modes
    for mode_name in mode_str.split(","):
        mode_flag = lookup(mode_name, "CS_MODE_")
        if mode_flag is None:
            return None
        # Capstone mode is a bitwise enum
        mode |= mode_flag

    # Arch and Mode are valid capstone attributes, instantiate Cs object
    # NOTE(review): capstone presumably raises on incompatible arch/mode pairs,
    # that error is not handled here - confirm against callers
    cs = capstone.Cs(arch, mode)
    # Enable details for operand
    cs.detail = True
    return cs


# =======================================


class OperandBackendBinExport(AbstractOperandBackend):
    """
    Backend operand that wraps a capstone operand together with the capstone
    instruction it belongs to.
    """

    def __init__(
        self,
        cs: capstone.Cs,
        cs_instruction: capstone.CsInsn,
        cs_operand: capstoneOperand,
        cs_operand_position: int,
    ):
        """
        :param cs: capstone context used to disassemble the instruction
        :param cs_instruction: capstone instruction that owns the operand
        :param cs_operand: capstone operand object
        :param cs_operand_position: index of the operand within the instruction
        """
        super().__init__()

        self.cs = cs
        self.cs_instr = cs_instruction
        self.cs_operand = cs_operand
        self.cs_operand_position = cs_operand_position

    def __str__(self) -> str:
        """
        Return the piece of the instruction operand string, split on commas, that
        is found at the position of this operand.
        """
        pieces = self.cs_instr.op_str.split(",")
        return pieces[self.cs_operand_position]

    @property
    def value(self) -> int | None:
        """
        Return the immediate value (not addresses) used by the operand,
        None if the operand is not an immediate.
        """
        return self.cs_operand.value.imm if self.is_immediate() else None

    @property
    def type(self) -> OperandType:
        """Returns the operand type, converted from the capstone one"""
        architecture = self.cs.arch
        return convert_operand_type(architecture, self.cs_operand)

    def is_immediate(self) -> bool:
        """Returns whether the operand is an immediate value (not considering addresses)"""
        if self.type != OperandType.immediate:
            return False
        # Ignore jumps since the target is an immediate
        return not self.cs_instr.group(capstone.CS_GRP_JUMP)
class InstructionBackendBinExport(AbstractInstructionBackend):
    """
    Backend instruction that wraps a capstone instruction.
    """

    def __init__(self, cs: capstone.Cs, cs_instruction: capstone.CsInsn):
        """
        :param cs: capstone context used to disassemble the instruction
        :param cs_instruction: the capstone instruction
        """
        super().__init__()

        self.cs = cs
        self.cs_instr = cs_instruction

    @property
    def addr(self) -> Addr:
        """The address of the instruction"""
        return self.cs_instr.address

    @property
    def mnemonic(self) -> str:
        """The mnemonic of the instruction as given by capstone"""
        return self.cs_instr.mnemonic

    @property
    def references(self) -> dict[ReferenceType, list[ReferenceTarget]]:
        """
        Returns all the references towards the instruction.
        BinExport only exports data references' address so no data type nor value.
        """
        # Not supported
        return {}

    @property
    def operands(self) -> Iterator[OperandBackendBinExport]:
        """Returns an iterator over backend operand objects"""
        if self.cs_instr is None:
            return iter(())

        return (
            OperandBackendBinExport(self.cs, self.cs_instr, operand, position)
            for position, operand in enumerate(self.cs_instr.operands)
        )

    @property
    def groups(self) -> list[InstructionGroup]:
        """
        Returns a list of groups of this instruction.
        """
        return [InstructionGroup.from_capstone(group) for group in self.cs_instr.groups]

    @property
    def id(self) -> int:
        """Return the capstone instruction ID"""
        return self.cs_instr.id

    @property
    def comment(self) -> str:
        """Comment of the instruction, always empty"""
        # Not supported
        return ""

    @property
    def bytes(self) -> bytes:
        """Returns the bytes representation of the instruction"""
        raw = self.cs_instr.bytes
        return bytes(raw)
class BasicBlockBackendBinExport(AbstractBasicBlockBackend):
    """
    Backend basic block that wraps a BinExport basic block and disassembles its
    bytes with capstone.
    """

    def __init__(self, program: weakref.ref[ProgramBackendBinExport], be_block: beBasicBlock):
        """
        :param program: weak reference on the backend program that owns the basic block
        :param be_block: the BinExport basic block
        """
        super().__init__()

        self._program = program
        self.be_block = be_block

    def __len__(self) -> int:
        """
        The numbers of instructions in the basic block
        """
        return len(self.be_block)

    def _guess_thumb_context(self, instr_bytes: bytes, mnemonic: str) -> int:
        """
        Guess whether the instruction is thumb or not.

        :param instr_bytes: raw bytes of the instruction
        :param mnemonic: mnemonic of the instruction according to BinExport
        :return: the capstone mode (CS_MODE_ARM or CS_MODE_THUMB)
        :raises ValueError: if the instruction is shorter than two bytes
        :raises TypeError: if no mode yields a matching mnemonic
        """

        if len(instr_bytes) < 2:
            # Must be an error
            raise ValueError(f"Instruction malformed of size {len(instr_bytes)} bytes.")

        if len(instr_bytes) == 2:
            # Must be thumb
            return capstone.CS_MODE_THUMB

        # Might be either thumb or normal arm.
        # There is no easy way of knowing whether a 4/8 bytes instruction is thumb or not
        # and IDA sometimes likes to merge two instructions together (so two 4bytes thumb
        # instructions might become a single 8bytes thumb instruction from IDA perspective).
        # The only way of checking if the context is correct is by comparing the mnemonic
        # of the capstone instruction with the one in BinExport.
        # Of course we have to rely on heuristics to know whether the two mnemonics are the
        # same or not.
        log_once(
            logging.WARNING,
            "Relying on heuristics to guess the context mode of the binary (thumb or not)",
        )

        arch = self.program.architecture_name

        # Bruteforce-guessing the context, try both modes
        for capstone_mode in (capstone.CS_MODE_ARM, capstone.CS_MODE_THUMB):
            # Disassemble the first instruction (if any) and check the mnemonic
            disassembler = _get_capstone_disassembler(arch, capstone_mode)
            instr = next(disassembler.disasm(instr_bytes, self.addr), None)
            if instr is not None and is_same_mnemonic(instr.mnemonic, mnemonic):
                return capstone_mode

        # We have not been lucky
        raise TypeError(
            f"Cannot guess {self.program.name} ISA for the instruction at address {self.addr:#x} "
            f"(consider setting it manually)"
        )

    def _disassemble(
        self, bb_asm: bytes, correct_mnemonic: str, correct_size: int
    ) -> list[capstone.CsInsn]:
        """
        Disassemble the basic block using capstone trying to guess the instruction set
        when unable to determine it from binexport.

        :param bb_asm: basic block raw bytes
        :param correct_mnemonic: The correct mnemonic of the first instruction of the
                                 basic block
        :param correct_size: The right size of the first instruction of the basic block
        :return: the list of capstone instructions of the basic block
        :raises NotImplementedError: if the architecture is not supported
        """

        # Check if we already have a capstone context, if so use it
        if self.program.cs:
            return list(self.program.cs.disasm(bb_asm, self.addr))

        # Continue with the old method
        arch = self.program.architecture_name
        capstone_mode = 0

        # No need to guess the context for these arch.
        # "x86-32" is the name known by the disassembler factory, without it 32 bits
        # x86 binaries could only be handled by setting the architecture manually
        if arch in ("x86", "x86-32", "x86-64", "MIPS-32", "MIPS-64", "ARM-64"):
            pass

        # For arm thumb use appropriate context guessing heuristics
        elif arch == "ARM-32":
            capstone_mode = self._guess_thumb_context(bb_asm[:correct_size], correct_mnemonic)

        # Everything else not yet supported
        else:
            raise NotImplementedError(f"The architecture {arch} is not yet supported in QBinDiff")

        # Set the program wide disassembler
        self.program.cs = _get_capstone_disassembler(arch, capstone_mode)
        return list(self.program.cs.disasm(bb_asm, self.addr))

    @property
    def program(self) -> ProgramBackendBinExport:
        """Wrapper on weak reference on ProgramBackendBinExport"""
        if (program := self._program()) is None:
            raise RuntimeError(
                "Trying to access an already expired weak reference on ProgramBackendBinExport"
            )
        return program

    @property
    def addr(self) -> Addr:
        """The address of the basic block"""
        return self.be_block.addr

    @property
    def instructions(self) -> Iterator[InstructionBackendBinExport]:
        """Returns an iterator over backend instruction objects"""

        # Generates the first instruction and use it to guess the context for capstone
        first_instr = next(iter(self.be_block.instructions.values()), None)
        if first_instr is None:
            # Empty basic block. A bare next() would raise StopIteration out of the
            # property, which can silently terminate an enclosing loop or generator
            return iter(())

        capstone_instructions = self._disassemble(
            self.be_block.bytes, first_instr.mnemonic, len(first_instr.bytes)
        )

        # Then iterate over the instructions
        return (
            InstructionBackendBinExport(self.program.cs, instr) for instr in capstone_instructions
        )

    @property
    def bytes(self) -> bytes:
        """The concatenation of the bytes of the disassembled instructions"""
        return b"".join(x.bytes for x in self.instructions)
class FunctionBackendBinExport(AbstractFunctionBackend):
    """
    Backend function that wraps a BinExport function.
    """

    def __init__(self, program: weakref.ref[ProgramBackendBinExport], be_func: beFunction):
        """
        :param program: weak reference on the backend program that owns the function
        :param be_func: the BinExport function
        """
        super().__init__()

        self.be_func = be_func
        self._program = program

    @property
    def basic_blocks(self) -> Iterator[BasicBlockBackendBinExport]:
        """Returns an iterator over backend basic blocks objects"""
        be_blocks = self.be_func.blocks
        return (BasicBlockBackendBinExport(self._program, block) for block in be_blocks.values())

    @property
    def addr(self) -> Addr:
        """The address of the function"""
        return self.be_func.addr

    @property
    def graph(self) -> "networkx.DiGraph":
        """The Control Flow Graph of the function"""
        return self.be_func.graph

    @property
    def parents(self) -> set[Addr]:
        """Set of function parents in the call graph"""
        return set(parent.addr for parent in self.be_func.parents)

    @property
    def children(self) -> set[Addr]:
        """Set of function children in the call graph"""
        return set(child.addr for child in self.be_func.children)

    @cached_property
    def type(self) -> FunctionType:
        """The type of the function (as defined by IDA)"""
        be_type = self.be_func.type
        be_types = binexport.types.FunctionType

        # Pairs of (BinExport type, qbindiff type), tested in order
        conversions = (
            (be_types.NORMAL, FunctionType.normal),
            (be_types.LIBRARY, FunctionType.library),
            (be_types.IMPORTED, FunctionType.imported),
            (be_types.THUNK, FunctionType.thunk),
            (be_types.INVALID, FunctionType.invalid),
        )
        for be_value, converted in conversions:
            if be_type == be_value:
                return converted

        raise NotImplementedError(f"Function type {be_type} not implemented")

    @property
    def name(self):
        """The name of the function"""
        return self.be_func.name

    def is_import(self) -> bool:
        """True if the function is imported"""
        # Should we consider also FunctionType.thunk?
        import_types = (FunctionType.imported, FunctionType.extern)
        return self.type in import_types
class ProgramBackendBinExport(AbstractProgramBackend):
    """
    Backend program that wraps a BinExport program.
    """

    def __init__(self, file: str, *, arch: str | None = None):
        """
        :param file: path of the BinExport file
        :param arch: optional capstone architecture and mode(s) in the format accepted
                     by `parse_architecture_flag`. If not set the capstone context is
                     chosen later, at basic block level
        :raises Exception: if `arch` is set but no capstone context can be built from it
        """
        super().__init__()

        self.be_prog = binexport.ProgramBinExport(file)
        self.architecture_name = self.be_prog.architecture
        self._fun_names: dict[str, Addr] = {}  # {fun_name : fun_address}
        self.cs = None

        # Check if the architecture is set by the user
        if arch:
            # Parse the architecture
            self.cs = parse_architecture_flag(arch)
            if not self.cs:
                raise Exception("Unable to instantiate capstone context from given arch: %s" % arch)
        else:
            logging.info(
                "No architecture set but BinExport backend is used. If invalid instructions"
                " are found consider setting manually the architecture"
            )
            # self.cs will be set at basic block level
            # self.cs = _get_capstone_disassembler(self.be_prog.architecture)

    def __repr__(self) -> str:
        return f"<{type(self).__name__}:{self.name}>"

    @property
    def functions(self) -> Iterator[FunctionBackendBinExport]:
        """
        Returns an iterator over backend function objects.
        As a side effect the function names cache is updated.
        """
        functions = []
        for _, func in self.be_prog.items():
            f = FunctionBackendBinExport(weakref.ref(self), func)
            functions.append(f)
            self._fun_names[f.name] = f.addr

        return iter(functions)

    @property
    def name(self) -> str:
        """The name of the program as given by BinExport"""
        return self.be_prog.name

    @property
    def structures(self) -> list[Structure]:
        """
        Returns the list of structures defined in program.
        WARNING: Not supported by BinExport
        """
        return []  # Not supported

    @property
    def callgraph(self) -> "networkx.DiGraph":
        """The call graph of the program"""
        return self.be_prog.callgraph

    @property
    def fun_names(self) -> dict[str, int]:
        """
        Returns a dictionary with function name as key and the function address as value
        """
        if not self._fun_names:
            # The cache is otherwise filled only as a side effect of `functions`. Fill it
            # here too so that the result does not depend on which property is read first
            self._fun_names.update({func.name: func.addr for _, func in self.be_prog.items()})
        return self._fun_names

    @property
    def exec_path(self) -> str:
        """
        Guess the raw binary name by removing the .BinExport extension from the name
        """
        return self.name.replace(".BinExport", "")

    @property
    def capabilities(self) -> ProgramCapability:
        """
        Returns the supported capabilities
        """
        return ProgramCapability.INSTR_GROUP