1096 lines
39 KiB
Python
Executable File
1096 lines
39 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# Copyright 2021 The Chromium Authors
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
# found in the LICENSE file.
|
|
"""Creates a table of unwind information in Android Chrome's bespoke format."""
|
|
|
|
import abc
|
|
import argparse
|
|
import collections
|
|
import enum
|
|
import json
|
|
import logging
|
|
import re
|
|
import struct
|
|
import subprocess
|
|
import sys
|
|
from typing import (Dict, Iterable, List, NamedTuple, Sequence, TextIO, Tuple,
|
|
Union)
|
|
|
|
from util import build_utils
|
|
|
|
_STACK_CFI_INIT_REGEX = re.compile(
|
|
r'^STACK CFI INIT ([0-9a-f]+) ([0-9a-f]+) (.+)$')
|
|
_STACK_CFI_REGEX = re.compile(r'^STACK CFI ([0-9a-f]+) (.+)$')
|
|
|
|
|
|
class AddressCfi(NamedTuple):
|
|
"""Record representing CFI for an address within a function.
|
|
|
|
Represents the Call Frame Information required to unwind from an address in a
|
|
function.
|
|
|
|
Attributes:
|
|
address: The address.
|
|
unwind_instructions: The unwind instructions for the address.
|
|
|
|
"""
|
|
address: int
|
|
unwind_instructions: str
|
|
|
|
|
|
class FunctionCfi(NamedTuple):
|
|
"""Record representing CFI for a function.
|
|
|
|
Note: address_cfi[0].address is the start address of the function.
|
|
|
|
Attributes:
|
|
size: The function size in bytes.
|
|
address_cfi: The CFI at each address in the function.
|
|
|
|
"""
|
|
size: int
|
|
address_cfi: Tuple[AddressCfi, ...]
|
|
|
|
|
|
def FilterToNonTombstoneCfi(stream: TextIO) -> Iterable[str]:
|
|
"""Generates non-tombstone STACK CFI lines from the stream.
|
|
|
|
STACK CFI functions with address 0 correspond are a 'tombstone' record
|
|
associated with dead code and can be ignored. See
|
|
https://bugs.llvm.org/show_bug.cgi?id=47148#c2.
|
|
|
|
Args:
|
|
stream: A file object.
|
|
|
|
Returns:
|
|
An iterable over the non-tombstone STACK CFI lines in the stream.
|
|
"""
|
|
in_tombstone_function = False
|
|
for line in stream:
|
|
if not line.startswith('STACK CFI '):
|
|
continue
|
|
|
|
if line.startswith('STACK CFI INIT 0 '):
|
|
in_tombstone_function = True
|
|
elif line.startswith('STACK CFI INIT '):
|
|
in_tombstone_function = False
|
|
|
|
if not in_tombstone_function:
|
|
yield line
|
|
|
|
|
|
def ReadFunctionCfi(stream: TextIO) -> Iterable[FunctionCfi]:
|
|
"""Generates FunctionCfi records from the stream.
|
|
|
|
Args:
|
|
stream: A file object.
|
|
|
|
Returns:
|
|
An iterable over FunctionCfi corresponding to the non-tombstone STACK CFI
|
|
lines in the stream.
|
|
"""
|
|
current_function_address = None
|
|
current_function_size = None
|
|
current_function_address_cfi = []
|
|
for line in FilterToNonTombstoneCfi(stream):
|
|
cfi_init_match = _STACK_CFI_INIT_REGEX.search(line)
|
|
if cfi_init_match:
|
|
# Function CFI with address 0 are tombstone entries per
|
|
# https://bugs.llvm.org/show_bug.cgi?id=47148#c2 and should have been
|
|
# filtered in `FilterToNonTombstoneCfi`.
|
|
assert current_function_address != 0
|
|
if (current_function_address is not None
|
|
and current_function_size is not None):
|
|
yield FunctionCfi(current_function_size,
|
|
tuple(current_function_address_cfi))
|
|
current_function_address = int(cfi_init_match.group(1), 16)
|
|
current_function_size = int(cfi_init_match.group(2), 16)
|
|
current_function_address_cfi = [
|
|
AddressCfi(int(cfi_init_match.group(1), 16), cfi_init_match.group(3))
|
|
]
|
|
else:
|
|
cfi_match = _STACK_CFI_REGEX.search(line)
|
|
assert cfi_match
|
|
current_function_address_cfi.append(
|
|
AddressCfi(int(cfi_match.group(1), 16), cfi_match.group(2)))
|
|
|
|
assert current_function_address is not None
|
|
assert current_function_size is not None
|
|
yield FunctionCfi(current_function_size, tuple(current_function_address_cfi))
|
|
|
|
|
|
def EncodeAsBytes(*values: int) -> bytes:
|
|
"""Encodes the argument ints as bytes.
|
|
|
|
This function validates that the inputs are within the range that can be
|
|
represented as bytes.
|
|
|
|
Args:
|
|
values: Integers in range [0, 255].
|
|
|
|
Returns:
|
|
The values encoded as bytes.
|
|
"""
|
|
for i, value in enumerate(values):
|
|
if not 0 <= value <= 255:
|
|
raise ValueError('value = %d out of bounds at byte %d' % (value, i))
|
|
return bytes(values)
|
|
|
|
|
|
def Uleb128Encode(value: int) -> bytes:
|
|
"""Encodes the argument int to ULEB128 format.
|
|
|
|
Args:
|
|
value: Unsigned integer.
|
|
|
|
Returns:
|
|
The values encoded as ULEB128 bytes.
|
|
"""
|
|
if value < 0:
|
|
raise ValueError(f'Cannot uleb128 encode negative value ({value}).')
|
|
|
|
uleb128_bytes = []
|
|
done = False
|
|
while not done:
|
|
value, lowest_seven_bits = divmod(value, 0x80)
|
|
done = value == 0
|
|
uleb128_bytes.append(lowest_seven_bits | (0x80 if not done else 0x00))
|
|
return EncodeAsBytes(*uleb128_bytes)
|
|
|
|
|
|
def EncodeStackPointerUpdate(offset: int) -> bytes:
|
|
"""Encodes a stack pointer update as arm unwind instructions.
|
|
|
|
Args:
|
|
offset: Offset to apply on stack pointer. Should be in range [-0x204, inf).
|
|
|
|
Returns:
|
|
A list of arm unwind instructions as bytes.
|
|
"""
|
|
assert offset % 4 == 0
|
|
|
|
abs_offset = abs(offset)
|
|
instruction_code = 0b01000000 if offset < 0 else 0b00000000
|
|
if 0x04 <= abs_offset <= 0x200:
|
|
instructions = [
|
|
# vsp = vsp + (xxxxxx << 2) + 4. Covers range 0x04-0x100 inclusive.
|
|
instruction_code | ((min(abs_offset, 0x100) - 4) >> 2)
|
|
]
|
|
# For vsp increments of 0x104-0x200 we use 00xxxxxx twice.
|
|
if abs_offset >= 0x104:
|
|
instructions.append(instruction_code | ((abs_offset - 0x100 - 4) >> 2))
|
|
try:
|
|
return EncodeAsBytes(*instructions)
|
|
except ValueError as e:
|
|
raise RuntimeError('offset = %d produced out of range value' %
|
|
offset) from e
|
|
else:
|
|
# This only encodes positive sp movement.
|
|
assert offset > 0, offset
|
|
return EncodeAsBytes(0b10110010 # vsp = vsp + 0x204 + (uleb128 << 2)
|
|
) + Uleb128Encode((offset - 0x204) >> 2)
|
|
|
|
|
|
def EncodePop(registers: Sequence[int]) -> bytes:
|
|
"""Encodes popping of a sequence of registers as arm unwind instructions.
|
|
|
|
Args:
|
|
registers: Collection of target registers to accept values popped from
|
|
stack. Register value order in the sequence does not matter. Values are
|
|
popped based on register index order.
|
|
|
|
Returns:
|
|
A list of arm unwind instructions as bytes.
|
|
"""
|
|
assert all(
|
|
r in range(4, 16)
|
|
for r in registers), f'Can only pop r4 ~ r15. Registers:\n{registers}.'
|
|
assert len(registers) > 0, 'Register sequence cannot be empty.'
|
|
|
|
instructions: List[int] = []
|
|
|
|
# Check if the pushed registers are continuous set starting from r4 (and
|
|
# ending prior to r12). This scenario has its own encoding.
|
|
pop_lr = 14 in registers
|
|
non_lr_registers = [r for r in registers if r != 14]
|
|
non_lr_registers_continuous_from_r4 = \
|
|
sorted(non_lr_registers) == list(range(4, 4 + len(non_lr_registers)))
|
|
|
|
if (pop_lr and 0 < len(non_lr_registers) <= 8
|
|
and non_lr_registers_continuous_from_r4):
|
|
instructions.append(0b10101000
|
|
| (len(non_lr_registers) - 1) # Pop r4-r[4+nnn], r14.
|
|
)
|
|
else:
|
|
register_bits = 0
|
|
for register in registers:
|
|
register_bits |= 1 << register
|
|
register_bits = register_bits >> 4 # Skip r0 ~ r3.
|
|
instructions.extend([
|
|
# Pop up to 12 integer registers under masks {r15-r12}, {r11-r4}.
|
|
0b10000000 | (register_bits >> 8),
|
|
register_bits & 0xff
|
|
])
|
|
|
|
return EncodeAsBytes(*instructions)
|
|
|
|
|
|
class UnwindType(enum.Enum):
|
|
"""
|
|
The type of unwind action to perform.
|
|
"""
|
|
|
|
# Use lr as the return address.
|
|
RETURN_TO_LR = 1
|
|
|
|
# Increment or decrement the stack pointer and/or pop registers (r4 ~ r15).
|
|
# If both, the increment/decrement occurs first.
|
|
UPDATE_SP_AND_OR_POP_REGISTERS = 2
|
|
|
|
# Restore the stack pointer from a register then increment/decrement the stack
|
|
# pointer.
|
|
RESTORE_SP_FROM_REGISTER = 3
|
|
|
|
# No action necessary. Used for floating point register pops.
|
|
NO_ACTION = 4
|
|
|
|
|
|
class AddressUnwind(NamedTuple):
|
|
"""Record representing unwind information for an address within a function.
|
|
|
|
Attributes:
|
|
address_offset: The offset of the address from the start of the function.
|
|
unwind_type: The type of unwind to perform from the address.
|
|
sp_offset: The offset to apply to the stack pointer.
|
|
registers: The registers involved in the unwind.
|
|
"""
|
|
address_offset: int
|
|
unwind_type: UnwindType
|
|
sp_offset: int
|
|
registers: Tuple[int, ...]
|
|
|
|
|
|
class FunctionUnwind(NamedTuple):
|
|
"""Record representing unwind information for a function.
|
|
|
|
Attributes:
|
|
address: The address of the function.
|
|
size: The function size in bytes.
|
|
address_unwind_info: The unwind info at each address in the function.
|
|
"""
|
|
|
|
address: int
|
|
size: int
|
|
address_unwinds: Tuple[AddressUnwind, ...]
|
|
|
|
|
|
def EncodeAddressUnwind(address_unwind: AddressUnwind) -> bytes:
|
|
"""Encodes an `AddressUnwind` object as arm unwind instructions.
|
|
|
|
Args:
|
|
address_unwind: Record representing unwind information for an address within
|
|
a function.
|
|
|
|
Returns:
|
|
A list of arm unwind instructions as bytes.
|
|
"""
|
|
if address_unwind.unwind_type == UnwindType.RETURN_TO_LR:
|
|
return EncodeAsBytes(0b10110000) # Finish.
|
|
if address_unwind.unwind_type == UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS:
|
|
return ((EncodeStackPointerUpdate(address_unwind.sp_offset)
|
|
if address_unwind.sp_offset else b'') +
|
|
(EncodePop(address_unwind.registers)
|
|
if address_unwind.registers else b''))
|
|
|
|
if address_unwind.unwind_type == UnwindType.RESTORE_SP_FROM_REGISTER:
|
|
assert len(address_unwind.registers) == 1
|
|
return (EncodeAsBytes(0b10010000
|
|
| address_unwind.registers[0] # Set vsp = r[nnnn].
|
|
) +
|
|
(EncodeStackPointerUpdate(address_unwind.sp_offset)
|
|
if address_unwind.sp_offset else b''))
|
|
|
|
if address_unwind.unwind_type == UnwindType.NO_ACTION:
|
|
return b''
|
|
|
|
assert False, 'unknown unwind type'
|
|
return b''
|
|
|
|
|
|
class UnwindInstructionsParser(abc.ABC):
|
|
"""Base class for parsers of breakpad unwind instruction sequences.
|
|
|
|
Provides regexes matching breakpad instruction sequences understood by the
|
|
parser, and parsing of the sequences from the regex match.
|
|
"""
|
|
|
|
@abc.abstractmethod
|
|
def GetBreakpadInstructionsRegex(self) -> re.Pattern:
|
|
pass
|
|
|
|
@abc.abstractmethod
|
|
def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
|
|
match: re.Match) -> Tuple[AddressUnwind, int]:
|
|
"""Returns the regex matching the breakpad instructions.
|
|
|
|
Args:
|
|
address_offset: Offset from function start address.
|
|
cfa_sp_offset: CFA stack pointer offset.
|
|
|
|
Returns:
|
|
The unwind info for the address plus the new cfa_sp_offset.
|
|
"""
|
|
|
|
|
|
class NullParser(UnwindInstructionsParser):
|
|
"""Translates the state before any instruction has been executed."""
|
|
|
|
regex = re.compile(r'^\.cfa: sp 0 \+ \.ra: lr$')
|
|
|
|
def GetBreakpadInstructionsRegex(self) -> re.Pattern:
|
|
return self.regex
|
|
|
|
def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
|
|
match: re.Match) -> Tuple[AddressUnwind, int]:
|
|
return AddressUnwind(address_offset, UnwindType.RETURN_TO_LR, 0, ()), 0
|
|
|
|
|
|
class PushOrSubSpParser(UnwindInstructionsParser):
|
|
"""Translates unwinds from push or sub sp, #constant instructions."""
|
|
|
|
# We expect at least one of the three outer groups to be non-empty. Cases:
|
|
#
|
|
# Standard prologue pushes.
|
|
# Match the first two and optionally the third.
|
|
#
|
|
# Standard prologue sub sp, #constant.
|
|
# Match only the first.
|
|
#
|
|
# Pushes in dynamic stack allocation functions after saving sp.
|
|
# Match only the third since they don't alter the stack pointer or store the
|
|
# return address.
|
|
#
|
|
# Leaf functions that use callee-save registers.
|
|
# Match the first and third but not the second.
|
|
regex = re.compile(r'^(?:\.cfa: sp (\d+) \+ ?)?'
|
|
r'(?:\.ra: \.cfa (-\d+) \+ \^ ?)?'
|
|
r'((?:r\d+: \.cfa -\d+ \+ \^ ?)*)$')
|
|
|
|
# 'r' followed by digits, with 'r' matched via positive lookbehind so only the
|
|
# number appears in the match.
|
|
register_regex = re.compile('(?<=r)(\d+)')
|
|
|
|
def GetBreakpadInstructionsRegex(self) -> re.Pattern:
|
|
return self.regex
|
|
|
|
def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
|
|
match: re.Match) -> Tuple[AddressUnwind, int]:
|
|
# The group will be None if the outer non-capturing groups for the(\d+) and
|
|
# (-\d+) expressions are not matched.
|
|
new_cfa_sp_offset, ra_cfa_offset = (int(group) if group else None
|
|
for group in match.groups()[:2])
|
|
|
|
# Registers are pushed in reverse order by register number so are popped in
|
|
# order. Sort them to ensure the proper order.
|
|
registers = sorted([
|
|
int(register)
|
|
for register in self.register_regex.findall(match.group(3))
|
|
# `UpdateSpAndOrPopRegisters` only supports popping of register
|
|
# r4 ~ r15. The ignored registers are translated to sp increments by
|
|
# the following calculation on `sp_offset`.
|
|
if int(register) in range(4, 16)
|
|
] +
|
|
# Also pop lr (ra in breakpad terms) if it was stored.
|
|
([14] if ra_cfa_offset is not None else []))
|
|
|
|
sp_offset = 0
|
|
if new_cfa_sp_offset is not None:
|
|
sp_offset = new_cfa_sp_offset - cfa_sp_offset
|
|
assert sp_offset % 4 == 0
|
|
if sp_offset >= len(registers) * 4:
|
|
# Handles the sub sp, #constant case, and push instructions that push
|
|
# caller-save registers r0-r3 which don't get encoded in the unwind
|
|
# instructions. In the latter case we need to move the stack pointer up
|
|
# to the first pushed register.
|
|
sp_offset -= len(registers) * 4
|
|
|
|
return AddressUnwind(address_offset,
|
|
UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS, sp_offset,
|
|
tuple(registers)), new_cfa_sp_offset or cfa_sp_offset
|
|
|
|
|
|
class VPushParser(UnwindInstructionsParser):
|
|
# VPushes that occur in dynamic stack allocation functions after storing the
|
|
# stack pointer don't change the stack pointer or push any register that we
|
|
# care about. The first group will not match in those cases.
|
|
#
|
|
# Breakpad doesn't seem to understand how to name the floating point
|
|
# registers so calls them unnamed_register.
|
|
regex = re.compile(r'^(?:\.cfa: sp (\d+) \+ )?'
|
|
r'(?:unnamed_register\d+: \.cfa -\d+ \+ \^ ?)+$')
|
|
|
|
def GetBreakpadInstructionsRegex(self) -> re.Pattern:
|
|
return self.regex
|
|
|
|
def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
|
|
match: re.Match) -> Tuple[AddressUnwind, int]:
|
|
# `match.group(1)`, which corresponds to the (\d+) expression, will be None
|
|
# if the first outer non-capturing group is not matched.
|
|
new_cfa_sp_offset = int(match.group(1)) if match.group(1) else None
|
|
if new_cfa_sp_offset is None:
|
|
return (AddressUnwind(address_offset, UnwindType.NO_ACTION, 0,
|
|
()), cfa_sp_offset)
|
|
|
|
sp_offset = new_cfa_sp_offset - cfa_sp_offset
|
|
assert sp_offset % 4 == 0
|
|
return AddressUnwind(address_offset,
|
|
UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS, sp_offset,
|
|
()), new_cfa_sp_offset
|
|
|
|
|
|
class StoreSpParser(UnwindInstructionsParser):
|
|
regex = re.compile(r'^\.cfa: r(\d+) (\d+) \+$')
|
|
|
|
def GetBreakpadInstructionsRegex(self) -> re.Pattern:
|
|
return self.regex
|
|
|
|
def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
|
|
match: re.Match) -> Tuple[AddressUnwind, int]:
|
|
register = int(match.group(1))
|
|
new_cfa_sp_offset = int(match.group(2))
|
|
sp_offset = new_cfa_sp_offset - cfa_sp_offset
|
|
assert sp_offset % 4 == 0
|
|
return AddressUnwind(address_offset, UnwindType.RESTORE_SP_FROM_REGISTER,
|
|
sp_offset, (register, )), new_cfa_sp_offset
|
|
|
|
|
|
def EncodeUnwindInstructionTable(complete_instruction_sequences: Iterable[bytes]
|
|
) -> Tuple[bytes, Dict[bytes, int]]:
|
|
"""Encodes the unwind instruction table.
|
|
|
|
Deduplicates the encoded unwind instruction sequences. Generates the table and
|
|
a dictionary mapping a function to its starting index in the table.
|
|
|
|
The instruction table is used by the unwinder to provide the sequence of
|
|
unwind instructions to execute for each function, separated by offset
|
|
into the function.
|
|
|
|
Args:
|
|
complete_instruction_sequences: An iterable of encoded unwind instruction
|
|
sequences. The sequences represent the series of unwind instructions to
|
|
execute corresponding to offsets within each function.
|
|
|
|
Returns:
|
|
A tuple containing:
|
|
- The unwind instruction table as bytes.
|
|
- The mapping from the instruction sequence to the offset in the unwind
|
|
instruction table. This mapping is used to construct the function offset
|
|
table, which references entries in the unwind instruction table.
|
|
"""
|
|
# As the function offset table uses variable length number encoding (uleb128),
|
|
# which means smaller number uses fewer bytes to represent, we should sort
|
|
# the unwind instruction table by number of references from the function
|
|
# offset table in order to minimize the size of the function offset table.
|
|
ref_counts: Dict[bytes, int] = collections.defaultdict(int)
|
|
for sequence in complete_instruction_sequences:
|
|
ref_counts[sequence] += 1
|
|
|
|
def ComputeScore(sequence):
|
|
""" Score for each sequence is computed as ref_count / size_of_sequence.
|
|
|
|
According to greedy algorithm, items with higher value / space cost ratio
|
|
should be prioritized. Here value is bytes saved in the function offset
|
|
table, represetned by ref_count. Space cost is the space taken in the
|
|
unwind instruction table, represented by size_of_sequence.
|
|
|
|
Note: In order to ensure build-time determinism, `sequence` is also returned
|
|
to resolve sorting order when scores are the same.
|
|
"""
|
|
return ref_counts[sequence] / len(sequence), sequence
|
|
|
|
ordered_sequences = sorted(ref_counts.keys(), key=ComputeScore, reverse=True)
|
|
offsets: Dict[bytes, int] = {}
|
|
current_offset = 0
|
|
for sequence in ordered_sequences:
|
|
offsets[sequence] = current_offset
|
|
current_offset += len(sequence)
|
|
return b''.join(ordered_sequences), offsets
|
|
|
|
|
|
class EncodedAddressUnwind(NamedTuple):
|
|
"""Record representing unwind information for an address within a function.
|
|
|
|
This structure represents the same concept as `AddressUnwind`. The only
|
|
difference is that how to unwind from the address is represented as
|
|
encoded ARM unwind instructions.
|
|
|
|
Attributes:
|
|
address_offset: The offset of the address from the start address of the
|
|
function.
|
|
complete_instruction_sequence: The full ARM unwind instruction sequence to
|
|
unwind from the `address_offset`.
|
|
"""
|
|
address_offset: int
|
|
complete_instruction_sequence: bytes
|
|
|
|
|
|
def EncodeAddressUnwinds(address_unwinds: Tuple[AddressUnwind, ...]
|
|
) -> Tuple[EncodedAddressUnwind, ...]:
|
|
"""Encodes the unwind instructions and offset for the addresses within a
|
|
function.
|
|
|
|
Args:
|
|
address_unwinds: A tuple of unwind state for addresses within a function.
|
|
|
|
Returns:
|
|
The encoded unwind instructions and offsets for the addresses within a
|
|
function, ordered by decreasing offset.
|
|
"""
|
|
sorted_address_unwinds: List[AddressUnwind] = sorted(
|
|
address_unwinds,
|
|
key=lambda address_unwind: address_unwind.address_offset,
|
|
reverse=True)
|
|
unwind_instructions: List[bytes] = [
|
|
EncodeAddressUnwind(address_unwind)
|
|
for address_unwind in sorted_address_unwinds
|
|
]
|
|
|
|
# A complete instruction sequence contains all the unwind instructions
|
|
# necessary to unwind from an offset within a function. For a given offset
|
|
# this includes the offset's instructions plus the instructions for all
|
|
# earlier offsets. The offsets are stored in reverse order, hence the i:
|
|
# range rather than :i+1.
|
|
complete_instruction_sequences = [
|
|
b''.join(unwind_instructions[i:]) for i in range(len(unwind_instructions))
|
|
]
|
|
|
|
encoded_unwinds: List[EncodedAddressUnwind] = []
|
|
for address_unwind, sequence in zip(sorted_address_unwinds,
|
|
complete_instruction_sequences):
|
|
encoded_unwinds.append(
|
|
EncodedAddressUnwind(address_unwind.address_offset, sequence))
|
|
return tuple(encoded_unwinds)
|
|
|
|
|
|
class EncodedFunctionUnwind(NamedTuple):
|
|
"""Record representing unwind information for a function.
|
|
|
|
This structure represents the same concept as `FunctionUnwind`, but with
|
|
some differences:
|
|
- Attribute `address` is split into 2 attributes: `page_number` and
|
|
`page_offset`.
|
|
- Attribute `size` is dropped.
|
|
- Attribute `address_unwinds` becomes a collection of `EncodedAddressUnwind`s,
|
|
instead of a collection of `AddressUnwind`s.
|
|
|
|
Attributes:
|
|
page_number: The upper bits (17 ~ 31bits) of byte offset from text section
|
|
start.
|
|
page_offset: The lower bits (1 ~ 16bits) of instruction offset from text
|
|
section start.
|
|
address_unwinds: A collection of `EncodedAddressUnwind`s.
|
|
|
|
"""
|
|
|
|
page_number: int
|
|
page_offset: int
|
|
address_unwinds: Tuple[EncodedAddressUnwind, ...]
|
|
|
|
|
|
# The trivial unwind is defined as a single `RETURN_TO_LR` instruction
|
|
# at the start of the function.
|
|
TRIVIAL_UNWIND: Tuple[EncodedAddressUnwind, ...] = EncodeAddressUnwinds(
|
|
(AddressUnwind(address_offset=0,
|
|
unwind_type=UnwindType.RETURN_TO_LR,
|
|
sp_offset=0,
|
|
registers=()), ))
|
|
|
|
# The refuse to unwind filler unwind is used to fill the invalid space
|
|
# before the first function in the first page and after the last function
|
|
# in the last page.
|
|
REFUSE_TO_UNWIND: Tuple[EncodedAddressUnwind, ...] = (EncodedAddressUnwind(
|
|
address_offset=0,
|
|
complete_instruction_sequence=bytes([0b10000000, 0b00000000])), )
|
|
|
|
|
|
def EncodeFunctionUnwinds(function_unwinds: Iterable[FunctionUnwind],
|
|
text_section_start_address: int
|
|
) -> Iterable[EncodedFunctionUnwind]:
|
|
"""Encodes the unwind state for all functions defined in the binary.
|
|
|
|
This function
|
|
- sorts the collection of `FunctionUnwind`s by address.
|
|
- fills in gaps between functions with trivial unwind.
|
|
- fills the space in the last page after last function with refuse to unwind.
|
|
- fills the space in the first page before the first function with refuse
|
|
to unwind.
|
|
|
|
Args:
|
|
function_unwinds: An iterable of function unwind states.
|
|
text_section_start_address: The address of .text section in ELF file.
|
|
|
|
Returns:
|
|
The encoded function unwind states with no gaps between functions, ordered
|
|
by ascending address.
|
|
"""
|
|
|
|
def GetPageNumber(address: int) -> int:
|
|
"""Calculates the page number.
|
|
|
|
Page number is calculated as byte_offset_from_text_section_start >> 17,
|
|
i.e. the upper bits (17 ~ 31bits) of byte offset from text section start.
|
|
"""
|
|
return (address - text_section_start_address) >> 17
|
|
|
|
def GetPageOffset(address: int) -> int:
|
|
"""Calculates the page offset.
|
|
|
|
Page offset is calculated as (byte_offset_from_text_section_start >> 1)
|
|
& 0xffff, i.e. the lower bits (1 ~ 16bits) of instruction offset from
|
|
text section start.
|
|
"""
|
|
return ((address - text_section_start_address) >> 1) & 0xffff
|
|
|
|
sorted_function_unwinds: List[FunctionUnwind] = sorted(
|
|
function_unwinds, key=lambda function_unwind: function_unwind.address)
|
|
|
|
if sorted_function_unwinds[0].address > text_section_start_address:
|
|
yield EncodedFunctionUnwind(page_number=0,
|
|
page_offset=0,
|
|
address_unwinds=REFUSE_TO_UNWIND)
|
|
|
|
prev_func_end_address: int = sorted_function_unwinds[0].address
|
|
|
|
gaps = 0
|
|
for unwind in sorted_function_unwinds:
|
|
assert prev_func_end_address <= unwind.address, (
|
|
'Detected overlap between functions.')
|
|
|
|
if prev_func_end_address < unwind.address:
|
|
# Gaps between functions are typically filled by regions of thunks which
|
|
# do not alter the stack pointer. Filling these gaps with TRIVIAL_UNWIND
|
|
# is the appropriate unwind strategy.
|
|
gaps += 1
|
|
yield EncodedFunctionUnwind(GetPageNumber(prev_func_end_address),
|
|
GetPageOffset(prev_func_end_address),
|
|
TRIVIAL_UNWIND)
|
|
|
|
yield EncodedFunctionUnwind(GetPageNumber(unwind.address),
|
|
GetPageOffset(unwind.address),
|
|
EncodeAddressUnwinds(unwind.address_unwinds))
|
|
|
|
prev_func_end_address = unwind.address + unwind.size
|
|
|
|
if GetPageOffset(prev_func_end_address) != 0:
|
|
yield EncodedFunctionUnwind(GetPageNumber(prev_func_end_address),
|
|
GetPageOffset(prev_func_end_address),
|
|
REFUSE_TO_UNWIND)
|
|
|
|
logging.info('%d/%d gaps between functions filled with trivial unwind.', gaps,
|
|
len(sorted_function_unwinds))
|
|
|
|
|
|
def EncodeFunctionOffsetTable(
|
|
encoded_address_unwind_sequences: Iterable[
|
|
Tuple[EncodedAddressUnwind, ...]],
|
|
unwind_instruction_table_offsets: Dict[bytes, int]
|
|
) -> Tuple[bytes, Dict[Tuple[EncodedAddressUnwind, ...], int]]:
|
|
"""Encodes the function offset table.
|
|
|
|
The function offset table maps local instruction offset from function
|
|
start to the location in the unwind instruction table.
|
|
|
|
Args:
|
|
encoded_address_unwind_sequences: An iterable of encoded address unwind
|
|
sequences.
|
|
unwind_instruction_table_offsets: The offset mapping returned from
|
|
`EncodeUnwindInstructionTable`.
|
|
|
|
Returns:
|
|
A tuple containing:
|
|
- The function offset table as bytes.
|
|
- The mapping from the `EncodedAddressUnwind`s to the offset in the function
|
|
offset table. This mapping is used to construct the function table, which
|
|
references entries in the function offset table.
|
|
"""
|
|
function_offset_table = bytearray()
|
|
offsets: Dict[Tuple[EncodedAddressUnwind, ...], int] = {}
|
|
|
|
for sequence in encoded_address_unwind_sequences:
|
|
if sequence in offsets:
|
|
continue
|
|
|
|
offsets[sequence] = len(function_offset_table)
|
|
for address_offset, complete_instruction_sequence in sequence:
|
|
# Note: address_offset is the number of bytes from one address to another,
|
|
# while the instruction_offset is the number of 2-byte instructions
|
|
# from one address to another.
|
|
instruction_offset = address_offset >> 1
|
|
function_offset_table += (
|
|
Uleb128Encode(instruction_offset) + Uleb128Encode(
|
|
unwind_instruction_table_offsets[complete_instruction_sequence]))
|
|
|
|
return bytes(function_offset_table), offsets
|
|
|
|
|
|
def EncodePageTableAndFunctionTable(
|
|
function_unwinds: Iterable[EncodedFunctionUnwind],
|
|
function_offset_table_offsets: Dict[Tuple[EncodedAddressUnwind, ...], int]
|
|
) -> Tuple[bytes, bytes]:
|
|
"""Encode page table and function table as bytes.
|
|
|
|
Page table:
|
|
A table that contains the mapping from page_number to the location of the
|
|
entry for the first function on the page in the function table.
|
|
|
|
Function table:
|
|
A table that contains the mapping from page_offset to the location of an entry
|
|
in the function offset table.
|
|
|
|
Args:
|
|
function_unwinds: All encoded function unwinds in the module.
|
|
function_offset_table_offsets: The offset mapping returned from
|
|
`EncodeFunctionOffsetTable`.
|
|
|
|
Returns:
|
|
A tuple containing:
|
|
- The page table as bytes.
|
|
- The function table as bytes.
|
|
"""
|
|
page_function_unwinds: Dict[
|
|
int, List[EncodedFunctionUnwind]] = collections.defaultdict(list)
|
|
for function_unwind in function_unwinds:
|
|
page_function_unwinds[function_unwind.page_number].append(function_unwind)
|
|
|
|
raw_page_table: List[int] = []
|
|
function_table = bytearray()
|
|
|
|
for page_number, same_page_function_unwinds in sorted(
|
|
page_function_unwinds.items(), key=lambda item: item[0]):
|
|
# Pad empty pages.
|
|
# Empty pages can occur when a function spans over multiple pages.
|
|
# Example:
|
|
# A page table with a starting function that spans 3 over pages.
|
|
# page_table:
|
|
# [0, 1, 1, 1]
|
|
# function_table:
|
|
# [
|
|
# # Page 0
|
|
# (0, 20) # This function spans from page 0 offset 0 to page 3 offset 5.
|
|
# # Page 1 is empty.
|
|
# # Page 2 is empty.
|
|
# # Page 3
|
|
# (6, 70)
|
|
# ]
|
|
assert page_number > len(raw_page_table) - 1
|
|
number_of_empty_pages = page_number - len(raw_page_table)
|
|
# The function table is represented as `base::FunctionTableEntry[]`,
|
|
# where `base::FunctionTableEntry` is 4 bytes.
|
|
function_table_index = len(function_table) // 4
|
|
raw_page_table.extend([function_table_index] * (number_of_empty_pages + 1))
|
|
assert page_number == len(raw_page_table) - 1
|
|
|
|
for function_unwind in sorted(
|
|
same_page_function_unwinds,
|
|
key=lambda function_unwind: function_unwind.page_offset):
|
|
function_table += struct.pack(
|
|
'HH', function_unwind.page_offset,
|
|
function_offset_table_offsets[function_unwind.address_unwinds])
|
|
|
|
page_table = struct.pack(f'{len(raw_page_table)}I', *raw_page_table)
|
|
|
|
return page_table, bytes(function_table)
|
|
|
|
|
|
ALL_PARSERS: Tuple[UnwindInstructionsParser, ...] = (
|
|
NullParser(),
|
|
PushOrSubSpParser(),
|
|
StoreSpParser(),
|
|
VPushParser(),
|
|
)
|
|
|
|
|
|
def ParseAddressCfi(address_cfi: AddressCfi, function_start_address: int,
|
|
parsers: Tuple[UnwindInstructionsParser, ...],
|
|
prev_cfa_sp_offset: int
|
|
) -> Tuple[Union[AddressUnwind, None], bool, int]:
|
|
"""Parses address CFI with given parsers.
|
|
|
|
Args:
|
|
address_cfi: The CFI for an address in the function.
|
|
function_start_address: The start address of the function.
|
|
parsers: Available parsers to try on CFI data.
|
|
prev_cfa_sp_offset: Previous CFA stack pointer offset.
|
|
|
|
Returns:
|
|
A tuple containing:
|
|
- An `AddressUnwind` object when the parse is successful, None otherwise.
|
|
- Whether the address is in function epilogue.
|
|
- The new cfa_sp_offset.
|
|
"""
|
|
for parser in parsers:
|
|
match = parser.GetBreakpadInstructionsRegex().search(
|
|
address_cfi.unwind_instructions)
|
|
if not match:
|
|
continue
|
|
|
|
address_unwind, cfa_sp_offset = parser.ParseFromMatch(
|
|
address_cfi.address - function_start_address, prev_cfa_sp_offset, match)
|
|
|
|
in_epilogue = (
|
|
prev_cfa_sp_offset > cfa_sp_offset
|
|
and address_unwind.unwind_type != UnwindType.RESTORE_SP_FROM_REGISTER)
|
|
|
|
return (address_unwind if not in_epilogue else None, in_epilogue,
|
|
cfa_sp_offset)
|
|
|
|
return None, False, prev_cfa_sp_offset
|
|
|
|
|
|
def GenerateUnwinds(function_cfis: Iterable[FunctionCfi],
|
|
parsers: Tuple[UnwindInstructionsParser, ...]
|
|
) -> Iterable[FunctionUnwind]:
|
|
"""Generates parsed function unwind states from breakpad CFI data.
|
|
|
|
This function parses `FunctionCfi`s to `FunctionUnwind`s using
|
|
`UnwindInstructionParser`.
|
|
|
|
Args:
|
|
function_cfis: An iterable of function CFI data.
|
|
parsers: Available parsers to try on CFI address data.
|
|
|
|
Returns:
|
|
An iterable of parsed function unwind states.
|
|
"""
|
|
functions = 0
|
|
addresses = 0
|
|
handled_addresses = 0
|
|
epilogues_seen = 0
|
|
|
|
for function_cfi in function_cfis:
|
|
functions += 1
|
|
address_unwinds: List[AddressUnwind] = []
|
|
cfa_sp_offset = 0
|
|
for address_cfi in function_cfi.address_cfi:
|
|
addresses += 1
|
|
|
|
address_unwind, in_epilogue, cfa_sp_offset = ParseAddressCfi(
|
|
address_cfi, function_cfi.address_cfi[0].address, parsers,
|
|
cfa_sp_offset)
|
|
|
|
if address_unwind:
|
|
handled_addresses += 1
|
|
address_unwinds.append(address_unwind)
|
|
continue
|
|
|
|
if in_epilogue:
|
|
epilogues_seen += 1
|
|
break
|
|
|
|
logging.info('unrecognized CFI: %x %s.', address_cfi.address,
|
|
address_cfi.unwind_instructions)
|
|
|
|
if address_unwinds:
|
|
# We expect that the unwind information for every function starts with a
|
|
# trivial unwind (RETURN_TO_LR) prior to the execution of any code in the
|
|
# function. This is required by the arm calling convention which involves
|
|
# setting lr to the return address on calling into a function.
|
|
assert address_unwinds[0].address_offset == 0
|
|
assert address_unwinds[0].unwind_type == UnwindType.RETURN_TO_LR
|
|
|
|
yield FunctionUnwind(function_cfi.address_cfi[0].address,
|
|
function_cfi.size, tuple(address_unwinds))
|
|
|
|
logging.info('%d functions.', functions)
|
|
logging.info('%d/%d addresses handled.', handled_addresses, addresses)
|
|
logging.info('epilogues_seen: %d.', epilogues_seen)
|
|
|
|
|
|
def EncodeUnwindInfo(page_table: bytes, function_table: bytes,
|
|
function_offset_table: bytes,
|
|
unwind_instruction_table: bytes) -> bytes:
|
|
"""Encodes all unwind tables as a single binary.
|
|
|
|
Concats all unwind table binaries together and attach a header at the start
|
|
with a offset-size pair for each table.
|
|
|
|
offset: The offset to the target table from the start of the unwind info
|
|
binary in bytes.
|
|
size: The declared size of the target table.
|
|
|
|
Both offset and size are represented as 32bit integers.
|
|
See `base::ChromeUnwindInfoHeaderAndroid` for more details.
|
|
|
|
Args:
|
|
page_table: The page table as bytes.
|
|
function_table: The function table as bytes.
|
|
function_offset_table: The function offset table as bytes.
|
|
unwind_instruction_table: The unwind instruction table as bytes.
|
|
|
|
Returns:
|
|
A single binary containing
|
|
- A header that points to the location of each table.
|
|
- All unwind tables.
|
|
"""
|
|
unwind_info_header = bytearray()
|
|
# Each table is represented as (offset, size) pair, both offset and size
|
|
# are represented as 4 byte integer.
|
|
unwind_info_header_size = 4 * 2 * 4
|
|
unwind_info_body = bytearray()
|
|
|
|
# Both the page_table and the function table need to be aligned because their
|
|
# contents are interpreted as multi-byte integers. However, the byte size of
|
|
# the header, the page table, the function table are all multiples of 4 and
|
|
# the resource will be memory mapped at a 4 byte boundary, so no extra care
|
|
# is required to align the page table and the function table.
|
|
#
|
|
# The function offset table and the unwind instruction table are accessed
|
|
# byte by byte, so they only need 1 byte alignment.
|
|
|
|
assert len(page_table) % 4 == 0, (
|
|
'Each entry in the page table should be 4-byte integer.')
|
|
assert len(function_table) % 4 == 0, (
|
|
'Each entry in the function table should be a pair of 2 2-byte integers.')
|
|
|
|
for table in page_table, function_table:
|
|
offset = unwind_info_header_size + len(unwind_info_body)
|
|
# For the page table and the function_table, declared size is the number of
|
|
# entries in each table. The tables will be aligned to a 4 byte boundary
|
|
# because the resource will be memory mapped at a 4 byte boundary and the
|
|
# header is a multiple of 4 bytes.
|
|
declared_size = len(table) // 4
|
|
unwind_info_header += struct.pack('II', offset, declared_size)
|
|
unwind_info_body += table
|
|
|
|
for table in function_offset_table, unwind_instruction_table:
|
|
offset = unwind_info_header_size + len(unwind_info_body)
|
|
# Because both the function offset table and the unwind instruction table
|
|
# contain variable length encoded numbers, the declared size is simply the
|
|
# number of bytes in each table. The tables only require 1 byte alignment.
|
|
declared_size = len(table)
|
|
unwind_info_header += struct.pack('II', offset, declared_size)
|
|
unwind_info_body += table
|
|
|
|
return bytes(unwind_info_header + unwind_info_body)
|
|
|
|
|
|
def GenerateUnwindTables(
|
|
encoded_function_unwinds_iterable: Iterable[EncodedFunctionUnwind]
|
|
) -> Tuple[bytes, bytes, bytes, bytes]:
|
|
"""Generates all unwind tables as bytes.
|
|
|
|
Args:
|
|
encoded_function_unwinds_iterable: Encoded function unwinds for all
|
|
functions in the ELF binary.
|
|
|
|
Returns:
|
|
A tuple containing:
|
|
- The page table as bytes.
|
|
- The function table as bytes.
|
|
- The function offset table as bytes.
|
|
- The unwind instruction table as bytes.
|
|
"""
|
|
encoded_function_unwinds: List[EncodedFunctionUnwind] = list(
|
|
encoded_function_unwinds_iterable)
|
|
complete_instruction_sequences: List[bytes] = []
|
|
encoded_address_unwind_sequences: List[Tuple[EncodedAddressUnwind, ...]] = []
|
|
|
|
for encoded_function_unwind in encoded_function_unwinds:
|
|
encoded_address_unwind_sequences.append(
|
|
encoded_function_unwind.address_unwinds)
|
|
for address_unwind in encoded_function_unwind.address_unwinds:
|
|
complete_instruction_sequences.append(
|
|
address_unwind.complete_instruction_sequence)
|
|
|
|
unwind_instruction_table, unwind_instruction_table_offsets = (
|
|
EncodeUnwindInstructionTable(complete_instruction_sequences))
|
|
|
|
function_offset_table, function_offset_table_offsets = (
|
|
EncodeFunctionOffsetTable(encoded_address_unwind_sequences,
|
|
unwind_instruction_table_offsets))
|
|
|
|
page_table, function_table = EncodePageTableAndFunctionTable(
|
|
encoded_function_unwinds, function_offset_table_offsets)
|
|
|
|
return (page_table, function_table, function_offset_table,
|
|
unwind_instruction_table)
|
|
|
|
|
|
def ReadTextSectionStartAddress(readobj_path: str, libchrome_path: str) -> int:
|
|
"""Reads the .text section start address of libchrome ELF.
|
|
|
|
Arguments:
|
|
readobj_path: Path to llvm-obj binary.
|
|
libchrome_path: Path to libchrome binary.
|
|
|
|
Returns:
|
|
The text section start address as a number.
|
|
"""
|
|
def GetSectionName(section) -> str:
|
|
# See crbug.com/1426287 for context on different JSON names.
|
|
if 'Name' in section['Section']['Name']:
|
|
return section['Section']['Name']['Name']
|
|
return section['Section']['Name']['Value']
|
|
|
|
proc = subprocess.Popen(
|
|
[readobj_path, '--sections', '--elf-output-style=JSON', libchrome_path],
|
|
stdout=subprocess.PIPE,
|
|
encoding='ascii')
|
|
|
|
elfs = json.loads(proc.stdout.read())[0]
|
|
sections = elfs['Sections']
|
|
|
|
return next(s['Section']['Address'] for s in sections
|
|
if GetSectionName(s) == '.text')
|
|
|
|
|
|
def main():
|
|
build_utils.InitLogging('CREATE_UNWIND_TABLE_DEBUG')
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
parser.add_argument('--input_path',
|
|
help='Path to the unstripped binary.',
|
|
required=True,
|
|
metavar='FILE')
|
|
parser.add_argument('--output_path',
|
|
help='Path to unwind info binary output.',
|
|
required=True,
|
|
metavar='FILE')
|
|
parser.add_argument('--dump_syms_path',
|
|
required=True,
|
|
help='The path of the dump_syms binary.',
|
|
metavar='FILE')
|
|
parser.add_argument('--readobj_path',
|
|
required=True,
|
|
help='The path of the llvm-readobj binary.',
|
|
metavar='FILE')
|
|
|
|
args = parser.parse_args()
|
|
proc = subprocess.Popen(['./' + args.dump_syms_path, args.input_path, '-v'],
|
|
stdout=subprocess.PIPE,
|
|
encoding='ascii')
|
|
|
|
function_cfis = ReadFunctionCfi(proc.stdout)
|
|
function_unwinds = GenerateUnwinds(function_cfis, parsers=ALL_PARSERS)
|
|
encoded_function_unwinds = EncodeFunctionUnwinds(
|
|
function_unwinds,
|
|
ReadTextSectionStartAddress(args.readobj_path, args.input_path))
|
|
(page_table, function_table, function_offset_table,
|
|
unwind_instruction_table) = GenerateUnwindTables(encoded_function_unwinds)
|
|
unwind_info: bytes = EncodeUnwindInfo(page_table, function_table,
|
|
function_offset_table,
|
|
unwind_instruction_table)
|
|
|
|
if proc.wait():
|
|
logging.critical('dump_syms exited with return code %d', proc.returncode)
|
|
sys.exit(proc.returncode)
|
|
|
|
with open(args.output_path, 'wb') as f:
|
|
f.write(unwind_info)
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(main())
|