533 lines
17 KiB
Python
Executable File
533 lines
17 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# Copyright 2019 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities for optimistically parsing dex files.

This file is not meant to provide a generic tool for analyzing dex files.
A DexFile class that exposes access to several memory items in the dex format
is provided, but it does not include error handling or validation.
"""
|
|
|
|
import argparse
|
|
import collections
|
|
import errno
|
|
import os
|
|
import re
|
|
import struct
|
|
import sys
|
|
import zipfile
|
|
|
|
# https://source.android.com/devices/tech/dalvik/dex-format#header-item
|
|
# (field name, struct format code) pairs for the dex header, listed in the
# order in which the fields are unpacked by _DexReader.ReadHeader().
_DEX_HEADER_FMT = (
    ('magic', '8s'),
    ('checksum', 'I'),
    ('signature', '20s'),
    ('file_size', 'I'),
    ('header_size', 'I'),
    ('endian_tag', 'I'),
    ('link_size', 'I'),
    ('link_off', 'I'),
    ('map_off', 'I'),
    ('string_ids_size', 'I'),
    ('string_ids_off', 'I'),
    ('type_ids_size', 'I'),
    ('type_ids_off', 'I'),
    ('proto_ids_size', 'I'),
    ('proto_ids_off', 'I'),
    ('field_ids_size', 'I'),
    ('field_ids_off', 'I'),
    ('method_ids_size', 'I'),
    ('method_ids_off', 'I'),
    ('class_defs_size', 'I'),
    ('class_defs_off', 'I'),
    ('data_size', 'I'),
    ('data_off', 'I'),
)

# Record type for a decoded header; its fields mirror _DEX_HEADER_FMT.
DexHeader = collections.namedtuple(
    'DexHeader', [field_name for field_name, _ in _DEX_HEADER_FMT])
|
|
|
|
# Simple memory items.
|
|
# Lightweight record types for the fixed-layout dex items decoded below.
_TypeIdItem = collections.namedtuple('TypeIdItem', ['descriptor_idx'])
_ProtoIdItem = collections.namedtuple(
    'ProtoIdItem', ['shorty_idx', 'return_type_idx', 'parameters_off'])
_MethodIdItem = collections.namedtuple(
    'MethodIdItem', ['type_idx', 'proto_idx', 'name_idx'])
_TypeItem = collections.namedtuple('TypeItem', ['type_idx'])
_StringDataItem = collections.namedtuple('StringItem', ['utf16_size', 'data'])
_ClassDefItem = collections.namedtuple('ClassDefItem', [
    'class_idx',
    'access_flags',
    'superclass_idx',
    'interfaces_off',
    'source_file_idx',
    'annotations_off',
    'class_data_off',
    'static_values_off',
])
|
|
|
|
|
|
class _MemoryItemList:
  """Base class for repeated memory items.

  Every item is decoded eagerly in the constructor; afterwards the list
  behaves like a read-only sequence (iteration, indexing and len()).
  """

  def __init__(self,
               reader,
               offset,
               size,
               factory,
               alignment=None,
               first_item_offset=None):
    """Creates the item list using the specific item factory.

    Args:
      reader: _DexReader used for decoding the memory item.
      offset: Offset from start of the file to the item list, serving as the
        key for some item types.
      size: Number of memory items in the list.
      factory: Function to extract each memory item from a _DexReader.
      alignment: Optional integer specifying the alignment for the memory
        section represented by this list.
      first_item_offset: Optional, specifies a different offset to use for
        extracting memory items (default is to use offset).
    """
    self.offset = offset
    self.size = size
    # Test against None rather than truthiness: 0 is a valid file offset and
    # must not silently fall back to |offset|.
    if first_item_offset is None:
      first_item_offset = offset
    reader.Seek(first_item_offset)
    self._items = [factory(reader) for _ in range(size)]

    # Leave the reader on the next aligned position once all items are read.
    if alignment:
      reader.AlignUpTo(alignment)

  def __iter__(self):
    return iter(self._items)

  def __getitem__(self, key):
    return self._items[key]

  def __len__(self):
    return len(self._items)

  def __repr__(self):
    # The item type is only reported when there is an item to inspect.
    item_type_part = ''
    if self._items:
      item_type = type(self._items[0])
      item_type_part = ', item type={}'.format(item_type.__name__)

    return '{}(offset={:#x}, size={}{})'.format(
        type(self).__name__, self.offset, self.size, item_type_part)
|
|
|
|
|
|
class _TypeIdItemList(_MemoryItemList):
  """Item list whose entries are each one uint, wrapped in a _TypeIdItem."""

  def __init__(self, reader, offset, size):

    def read_type_id(item_reader):
      return _TypeIdItem(descriptor_idx=item_reader.ReadUInt())

    super().__init__(reader, offset, size, read_type_id)
|
|
|
|
|
|
class _ProtoIdItemList(_MemoryItemList):
  """Item list whose entries are three consecutive uints (_ProtoIdItem)."""

  def __init__(self, reader, offset, size):

    def read_proto_id(item_reader):
      # The three reads must happen in on-disk field order.
      shorty_idx = item_reader.ReadUInt()
      return_type_idx = item_reader.ReadUInt()
      parameters_off = item_reader.ReadUInt()
      return _ProtoIdItem(shorty_idx, return_type_idx, parameters_off)

    super().__init__(reader, offset, size, read_proto_id)
|
|
|
|
|
|
class _MethodIdItemList(_MemoryItemList):
  """Item list whose entries are ushort, ushort, uint (_MethodIdItem)."""

  def __init__(self, reader, offset, size):

    def read_method_id(item_reader):
      # The reads must happen in on-disk field order.
      type_idx = item_reader.ReadUShort()
      proto_idx = item_reader.ReadUShort()
      name_idx = item_reader.ReadUInt()
      return _MethodIdItem(type_idx, proto_idx, name_idx)

    super().__init__(reader, offset, size, read_method_id)
|
|
|
|
|
|
class _StringItemList(_MemoryItemList):
  """Item list of decoded strings, located through a table of uint offsets."""

  def __init__(self, reader, offset, size):
    # Read the complete offset table before decoding anything: decoding a
    # string repositions the reader away from the table.
    reader.Seek(offset)
    data_offsets = []
    for _ in range(size):
      data_offsets.append(reader.ReadUInt())
    remaining_offsets = iter(data_offsets)

    def read_string_item(item_reader):
      # Each call consumes the next table entry, in table order.
      text = item_reader.ReadString(next(remaining_offsets))
      return _StringDataItem(utf16_size=len(text), data=text)

    super().__init__(reader, offset, size, read_string_item)
|
|
|
|
|
|
class _TypeListItem(_MemoryItemList):
  """A single type list: a uint entry count followed by ushort entries."""

  def __init__(self, reader):
    list_start = reader.Tell()
    # Unlike the other lists, the entry count is stored inline ahead of the
    # entries instead of being supplied by the header, so read it here and
    # remember where the entries themselves begin.
    entry_count = reader.ReadUInt()
    entries_start = reader.Tell()

    def read_type_item(item_reader):
      return _TypeItem(type_idx=item_reader.ReadUShort())

    super().__init__(reader,
                     list_start,
                     entry_count,
                     read_type_item,
                     alignment=4,
                     first_item_offset=entries_start)
|
|
|
|
|
|
class _TypeListItemList(_MemoryItemList):
  """Item list whose entries are themselves _TypeListItem lists."""

  def __init__(self, reader, offset, size):
    # _TypeListItem takes the reader as its only argument, so the class
    # itself serves as the item factory.
    super().__init__(reader, offset, size, factory=_TypeListItem)
|
|
|
|
|
|
class _ClassDefItemList(_MemoryItemList):
  """Item list of _ClassDefItems, each read as one uint per field."""

  def __init__(self, reader, offset, size):
    reader.Seek(offset)
    field_count = len(_ClassDefItem._fields)

    def read_class_def(item_reader):
      values = [item_reader.ReadUInt() for _ in range(field_count)]
      return _ClassDefItem._make(values)

    super().__init__(reader, offset, size, read_class_def)
|
|
|
|
|
|
class _DexMapItem:
  """One map entry: a type, a size and an offset read from a reader."""

  def __init__(self, reader):
    entry_type = reader.ReadUShort()
    reader.ReadUShort()  # This ushort is read and discarded.
    entry_size = reader.ReadUInt()
    entry_offset = reader.ReadUInt()

    self.type = entry_type
    self.size = entry_size
    self.offset = entry_offset

  def __repr__(self):
    template = '_DexMapItem(type={}, size={}, offset={:#x})'
    return template.format(self.type, self.size, self.offset)
|
|
|
|
|
|
class _DexMapList:
  """Map entries keyed by their type, read from a uint-counted list."""

  # Full list of type codes:
  # https://source.android.com/devices/tech/dalvik/dex-format#type-codes
  TYPE_TYPE_LIST = 0x1001

  def __init__(self, reader, offset):
    reader.Seek(offset)
    self._size = reader.ReadUInt()
    # Entries are read back-to-back; a later entry replaces an earlier one
    # that has the same type.
    entries = (_DexMapItem(reader) for _ in range(self._size))
    self._map = {entry.type: entry for entry in entries}

  def __getitem__(self, key):
    return self._map[key]

  def __contains__(self, key):
    return key in self._map

  def __repr__(self):
    template = '_DexMapList(size={}, items={})'
    return template.format(self._size, self._map)
|
|
|
|
|
|
class _DexReader:
  """Cursor-based little-endian decoder over the bytes of a dex file."""

  def __init__(self, data):
    """Wraps |data| with a read position that starts at 0.

    Args:
      data: Byte buffer to decode. It is indexed directly, so indexing must
        yield ints (as bytes and bytearray do).
    """
    self._data = data
    self._pos = 0

  def Seek(self, offset):
    """Moves the read position to |offset| (no bounds checking)."""
    self._pos = offset

  def Tell(self):
    """Returns the current read position."""
    return self._pos

  def ReadUByte(self):
    """Reads one unsigned byte and advances past it."""
    return self._ReadData('<B')

  def ReadUShort(self):
    """Reads a little-endian unsigned 16-bit value and advances past it."""
    return self._ReadData('<H')

  def ReadUInt(self):
    """Reads a little-endian unsigned 32-bit value and advances past it."""
    return self._ReadData('<I')

  def ReadString(self, data_offset):
    """Decodes the string whose uleb128 length prefix is at |data_offset|.

    The read position is moved: it ends up just past the string's
    terminating zero byte.
    """
    string_length, string_offset = self._ReadULeb128(data_offset)
    string_data_offset = string_offset + data_offset
    return self._DecodeMUtf8(string_length, string_data_offset)

  def AlignUpTo(self, align_unit):
    """Advances the read position to the next multiple of |align_unit|.

    Does nothing when the position is already aligned.
    """
    off_by = self._pos % align_unit
    if off_by:
      self.Seek(self._pos + align_unit - off_by)

  def ReadHeader(self):
    """Returns the DexHeader decoded from the start of the data.

    The read position is not used and not changed.
    """
    header_fmt = '<' + ''.join(t[1] for t in _DEX_HEADER_FMT)
    return DexHeader._make(struct.unpack_from(header_fmt, self._data))

  def _ReadData(self, fmt):
    """Unpacks one value of struct format |fmt| and advances past it."""
    ret = struct.unpack_from(fmt, self._data, self._pos)[0]
    self._pos += struct.calcsize(fmt)
    return ret

  def _ReadULeb128(self, data_offset):
    """Returns a tuple of (uleb128 value, number of bytes occupied).

    From DWARF3 spec: http://dwarfstd.org/doc/Dwarf3.pdf

    The read position is neither used nor changed.

    Args:
      data_offset: Location of the unsigned LEB128.
    """
    value = 0
    shift = 0
    cur_offset = data_offset
    while True:
      byte = self._data[cur_offset]
      cur_offset += 1
      # The low 7 bits carry payload; the high bit flags a continuation.
      value |= (byte & 0b01111111) << shift
      if (byte & 0b10000000) == 0:
        break
      shift += 7

    return value, cur_offset - data_offset

  def _DecodeMUtf8(self, string_length, offset):
    """Returns the string located at the specified offset.

    See https://source.android.com/devices/tech/dalvik/dex-format#mutf-8

    Ported from the Android Java implementation:
    https://android.googlesource.com/platform/dalvik/+/fe107fb6e3f308ac5174ebdc5a794ee880c741d9/dx/src/com/android/dex/Mutf8.java#34

    Each decoded 1-, 2- or 3-byte sequence becomes exactly one character of
    the result, so the result always has |string_length| characters.

    Args:
      string_length: The length of the decoded string.
      offset: Offset to the beginning of the string.

    Raises:
      _MUTf8DecodeError: If the encoded data is malformed or is not followed
        by a zero byte.
    """
    self.Seek(offset)
    # Gather the characters and join once at the end rather than growing a
    # str with +=, which can be quadratic in the string length.
    chars = []

    for _ in range(string_length):
      a = self.ReadUByte()
      if a == 0:
        raise _MUTf8DecodeError('Early string termination encountered',
                                string_length, offset)
      if (a & 0x80) == 0x00:
        # Single byte: 0xxxxxxx.
        code = a
      elif (a & 0xe0) == 0xc0:
        # Two bytes: 110xxxxx 10xxxxxx.
        b = self.ReadUByte()
        if (b & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 2', string_length, offset)
        code = ((a & 0x1f) << 6) | (b & 0x3f)
      elif (a & 0xf0) == 0xe0:
        # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
        b = self.ReadUByte()
        c = self.ReadUByte()
        if (b & 0xc0) != 0x80 or (c & 0xc0) != 0x80:
          # b and c are the second and third bytes of the sequence.
          raise _MUTf8DecodeError('Error in byte 2 or 3', string_length, offset)
        code = ((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f)
      else:
        raise _MUTf8DecodeError('Bad byte', string_length, offset)
      chars.append(chr(code))

    if self.ReadUByte() != 0x00:
      raise _MUTf8DecodeError('Expected string termination', string_length,
                              offset)

    return ''.join(chars)
|
|
|
|
|
|
class _MUTf8DecodeError(Exception):
  """Decoding failure; the message records the string length and offset."""

  def __init__(self, message, length, offset):
    suffix_template = ' (decoded string length: {}, string data offset: {:#x})'
    super().__init__(message + suffix_template.format(length, offset))
|
|
|
|
|
|
class DexFile:
  """Represents a single dex file.

  Parses and exposes access to dex file structure and contents, as described
  at https://source.android.com/devices/tech/dalvik/dex-format

  Fields:
    reader: _DexReader object used to decode dex file contents.
    header: DexHeader for this dex file.
    map_list: _DexMapList object containing list of dex file contents.
    type_item_list: _TypeIdItemList containing type_id_items.
    proto_item_list: _ProtoIdItemList containing proto_id_items.
    method_item_list: _MethodIdItemList containing method_id_items.
    string_item_list: _StringItemList containing string_data_items that are
      referenced by index in other sections.
    type_list_item_list: _TypeListItemList containing _TypeListItems.
      _TypeListItems are referenced by their offsets from other dex items.
    class_def_item_list: _ClassDefItemList containing _ClassDefItems.
  """
  # Access flag bit -> keyword. ResolveClassAccessFlags() reports keywords in
  # this order.
  _CLASS_ACCESS_FLAGS = {
      0x1: 'public',
      0x2: 'private',
      0x4: 'protected',
      0x8: 'static',
      0x10: 'final',
      0x200: 'interface',
      0x400: 'abstract',
      0x1000: 'synthetic',
      0x2000: 'annotation',
      0x4000: 'enum',
  }

  def __init__(self, data):
    """Decodes dex file memory sections.

    Args:
      data: bytearray containing the contents of a dex file.
    """
    reader = _DexReader(data)
    header = reader.ReadHeader()
    self.reader = reader
    self.header = header

    self.map_list = _DexMapList(reader, header.map_off)
    self.type_item_list = _TypeIdItemList(
        reader, header.type_ids_off, header.type_ids_size)
    self.proto_item_list = _ProtoIdItemList(
        reader, header.proto_ids_off, header.proto_ids_size)
    self.method_item_list = _MethodIdItemList(
        reader, header.method_ids_off, header.method_ids_size)
    self.string_item_list = _StringItemList(
        reader, header.string_ids_off, header.string_ids_size)
    self.class_def_item_list = _ClassDefItemList(
        reader, header.class_defs_off, header.class_defs_size)

    # The type list section is located through the map list; when the map
    # has no such entry an empty list is built instead.
    if _DexMapList.TYPE_TYPE_LIST in self.map_list:
      type_list_entry = self.map_list[_DexMapList.TYPE_TYPE_LIST]
      type_lists_offset = type_list_entry.offset
      type_lists_count = type_list_entry.size
    else:
      type_lists_offset = 0
      type_lists_count = 0
    self.type_list_item_list = _TypeListItemList(reader, type_lists_offset,
                                                 type_lists_count)

    # Other items refer to a type list by its file offset.
    self._type_lists_by_offset = {}
    for type_list in self.type_list_item_list:
      self._type_lists_by_offset[type_list.offset] = type_list

  def GetString(self, string_item_idx):
    """Returns the decoded string at index |string_item_idx|."""
    return self.string_item_list[string_item_idx].data

  def GetTypeString(self, type_item_idx):
    """Returns the descriptor string of the type at |type_item_idx|."""
    descriptor_idx = self.type_item_list[type_item_idx].descriptor_idx
    return self.GetString(descriptor_idx)

  def GetTypeListStringsByOffset(self, offset):
    """Returns the type strings of the type list starting at |offset|.

    A falsy offset (0 or None) yields an empty tuple.
    """
    if offset:
      type_list = self._type_lists_by_offset[offset]
      return tuple(self.GetTypeString(item.type_idx) for item in type_list)
    return ()

  @staticmethod
  def ResolveClassAccessFlags(access_flags):
    """Returns a tuple of the keywords whose bits are set in |access_flags|."""
    keywords = [
        keyword for bit, keyword in DexFile._CLASS_ACCESS_FLAGS.items()
        if access_flags & bit
    ]
    return tuple(keywords)

  def IterMethodSignatureParts(self):
    """Yields the string components of dex methods in a dex file.

    Yields:
      Tuples that look like:
        (class name, return type, method name, (parameter type, ...)).
    """
    for method in self.method_item_list:
      owner_class = self.GetTypeString(method.type_idx)
      method_name = self.GetString(method.name_idx)
      proto = self.proto_item_list[method.proto_idx]
      return_type = self.GetTypeString(proto.return_type_idx)
      parameter_types = self.GetTypeListStringsByOffset(proto.parameters_off)
      yield owner_class, return_type, method_name, parameter_types

  def __repr__(self):
    # One line per section, in a fixed order.
    sections = (
        self.header,
        self.map_list,
        self.type_item_list,
        self.proto_item_list,
        self.method_item_list,
        self.string_item_list,
        self.type_list_item_list,
        self.class_def_item_list,
    )
    return '\n'.join(map(str, sections))
|
|
|
|
|
|
class _DumpCommand:
  """Base class for the dump commands; stores the dex file to dump."""

  def __init__(self, dexfile):
    self._dexfile = dexfile

  def Run(self):
    """Performs the dump. Subclasses are expected to override this."""
    raise NotImplementedError
|
|
|
|
|
|
class _DumpMethods(_DumpCommand):
  """Prints one line per method signature in the dex file."""

  def Run(self):
    line_template = '{} {} (return type={}, parameters={})'
    signatures = self._dexfile.IterMethodSignatureParts()
    # Signature tuples arrive as (class, return type, name, parameters); the
    # printed line puts the method name right after the class.
    for class_type, return_type, method_name, parameter_types in signatures:
      print(
          line_template.format(class_type, method_name, return_type,
                               parameter_types))
|
|
|
|
|
|
class _DumpStrings(_DumpCommand):
  """Prints every string in the dex file as UTF-8 bytes, one per line."""

  def Run(self):
    for string_item in self._dexfile.string_item_list:
      # Some strings are likely to be non-ascii (vs. methods/classes).
      # The MUTF-8 decoder in this file produces one character per encoded
      # sequence, so a decoded string can contain surrogate code points. A
      # strict UTF-8 encode raises UnicodeEncodeError on those;
      # 'surrogatepass' encodes them instead of aborting the dump. Output
      # for strings without surrogates is unchanged.
      print(string_item.data.encode('utf-8', 'surrogatepass'))
|
|
|
|
|
|
class _DumpClasses(_DumpCommand):
  """Prints one line per class definition in the dex file."""

  # Value the dex format stores in an index field that has no referent
  # (NO_INDEX), e.g. superclass_idx of a root class.
  _NO_INDEX = 0xffffffff

  def Run(self):
    dexfile = self._dexfile
    for class_item in dexfile.class_def_item_list:
      class_string = dexfile.GetTypeString(class_item.class_idx)
      # Looking up NO_INDEX in the type list would raise IndexError, so a
      # class without a superclass is reported as superclass=None.
      if class_item.superclass_idx == self._NO_INDEX:
        superclass_string = None
      else:
        superclass_string = dexfile.GetTypeString(class_item.superclass_idx)
      interfaces = dexfile.GetTypeListStringsByOffset(
          class_item.interfaces_off)
      access_flags = DexFile.ResolveClassAccessFlags(class_item.access_flags)
      print('{} (superclass={}, interfaces={}, access_flags={})'.format(
          class_string, superclass_string, interfaces, access_flags))
|
|
|
|
|
|
class _DumpSummary(_DumpCommand):
  """Prints the string form of the dex file itself."""

  def Run(self):
    summary = str(self._dexfile)
    print(summary)
|
|
|
|
|
|
def _DumpDexItems(dexfile_data, name, item):
  """Parses one dex file and prints the requested kind of item to stdout.

  Args:
    dexfile_data: Raw contents of the dex file.
    name: Name of the dex file, used only in the banner line.
    item: Which dump to run: 'summary', 'methods', 'strings' or 'classes'.

  Raises:
    KeyError: If |item| is not one of the supported dumps.
    IOError: If writing the output fails for any reason other than a closed
      pipe.
  """
  dexfile = DexFile(bytearray(dexfile_data))
  print('dex_parser: Dumping {} for {}'.format(item, name))
  cmds = {
      'summary': _DumpSummary,
      'methods': _DumpMethods,
      'strings': _DumpStrings,
      'classes': _DumpClasses,
  }
  try:
    cmds[item](dexfile).Run()
  except IOError as e:
    # A closed pipe is expected (assume we're piping to "less" and it quit),
    # so it is ignored. Every other I/O error is a real failure and must
    # reach the caller instead of being silently dropped.
    if e.errno != errno.EPIPE:
      raise
|
|
|
|
|
|
def main():
  """Parses the command line and dumps the requested items to stdout.

  Archive inputs (.apk, .jar, .zip, .aab) are searched for classes*.dex
  entries and each one is dumped; any other input is read as a single dex
  file. Exits with status 1 when an archive has no matching entries.
  """
  parser = argparse.ArgumentParser(description='Dump dex contents to stdout.')
  parser.add_argument(
      'input', help='Input (.dex, .jar, .zip, .aab, .apk) file path.')
  parser.add_argument(
      'item',
      nargs='?',
      default='summary',
      choices=('methods', 'strings', 'classes', 'summary'),
      help='Item to dump')
  args = parser.parse_args()

  extension = os.path.splitext(args.input)[1]
  if extension not in ('.apk', '.jar', '.zip', '.aab'):
    # Not an archive: treat the input itself as a dex file.
    with open(args.input, 'rb') as f:
      _DumpDexItems(f.read(), args.input, args.item)
    return

  dex_name_pattern = re.compile(r'.*classes[0-9]*\.dex$')
  with zipfile.ZipFile(args.input) as z:
    dex_file_paths = [f for f in z.namelist() if dex_name_pattern.match(f)]
    if not dex_file_paths:
      print('Error: {} does not contain any classes.dex files'.format(
          args.input))
      sys.exit(1)

    for path in dex_file_paths:
      _DumpDexItems(z.read(path), path, args.item)


if __name__ == '__main__':
  main()
|