400 lines
13 KiB
Python
400 lines
13 KiB
Python
#!/usr/bin/python3
|
|
#
|
|
# Copyright (C) 2021 The Android Open Source Project
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Utilities for comparing two version of a codebase."""
|
|
|
|
import argparse
|
|
import difflib
|
|
import filecmp
|
|
import os
|
|
import pathlib
|
|
import re
|
|
|
|
|
|
class FileStat:
|
|
"""File statistics class for a file."""
|
|
|
|
NON_TEXT = 0
|
|
TEXT = 1
|
|
|
|
def __init__(self, file_path):
|
|
"""Initializes with a file path string."""
|
|
if file_path:
|
|
self.file_name = str(file_path)
|
|
self.size = file_path.stat().st_size
|
|
else:
|
|
self.file_name = ''
|
|
self.size = 0
|
|
|
|
self.line_cnt = 0
|
|
self.group_cnt = 0
|
|
self.add_line_cnt = 0
|
|
self.remove_line_cnt = 0
|
|
self.replace_line_cnt = 0
|
|
|
|
@staticmethod
|
|
def get_csv_header(prefix=None):
|
|
"""Returns CSV header string."""
|
|
cols = ['file', 'size', 'line', 'group', 'add', 'remove', 'replace']
|
|
if prefix:
|
|
return ','.join('{0}_{1}'.format(prefix, c) for c in cols)
|
|
else:
|
|
return ','.join(c for c in cols)
|
|
|
|
def get_csv_str(self, strip_dir_len=0):
|
|
"""Returns the file statistic CSV string."""
|
|
name = self.file_name[strip_dir_len:]
|
|
csv = [
|
|
FileStat.no_comma(name), self.size, self.line_cnt, self.group_cnt,
|
|
self.add_line_cnt, self.remove_line_cnt, self.replace_line_cnt
|
|
]
|
|
return ','.join(str(i) for i in csv)
|
|
|
|
@staticmethod
|
|
def no_comma(astr):
|
|
"""Replaces , with _."""
|
|
return astr.replace(',', '_')
|
|
|
|
|
|
class DiffStat:
|
|
"""Diff statistic class for 2 versions of a file."""
|
|
|
|
SAME = 0
|
|
NEW = 1
|
|
REMOVED = 2
|
|
MODIFIED = 3
|
|
INCOMPARABLE = 4
|
|
|
|
def __init__(self, common_name, old_file_stat, new_file_stat, state):
|
|
"""Initializes with the common names & etc."""
|
|
self.old_file_stat = old_file_stat
|
|
self.new_file_stat = new_file_stat
|
|
self.name = common_name
|
|
self.ext = os.path.splitext(self.name)[1].lstrip('.')
|
|
self.state = state
|
|
self.file_type = FileStat.NON_TEXT
|
|
|
|
def add_diff_stat(self, diff_lines):
|
|
"""Adds the statistic by the diff lines."""
|
|
# These align with https://github.com/python/cpython/blob/3.9/Lib/difflib.py
|
|
old_pattern = re.compile(r'\*{3} (.*)')
|
|
new_pattern = re.compile(r'-{3} (.*)')
|
|
group_separator = '***************'
|
|
old_group_header = re.compile(r'\*{3} (\d*),(\d*) \*{4}')
|
|
new_group_header = re.compile(r'-{3} (\d*),(\d*) -{4}')
|
|
|
|
# section 0 is old verion & 1 is new verion
|
|
section = -1
|
|
diff_stats = [self.old_file_stat, self.new_file_stat]
|
|
in_group = False
|
|
|
|
h1m = old_pattern.match(diff_lines[0])
|
|
if not h1m:
|
|
print('ERROR: wrong diff header line 1: %s' % diff_lines[0])
|
|
return
|
|
|
|
h2m = new_pattern.match(diff_lines[1])
|
|
if not h2m:
|
|
print('ERROR: wrong diff header line 2: %s' % diff_lines[1])
|
|
return
|
|
|
|
for line in diff_lines[2:]:
|
|
if in_group:
|
|
if line.startswith(' '):
|
|
# equal
|
|
continue
|
|
elif line.startswith('! '):
|
|
# replace
|
|
diff_stats[section].replace_line_cnt += 1
|
|
continue
|
|
elif line.startswith('+ '):
|
|
# add
|
|
diff_stats[section].add_line_cnt += 1
|
|
continue
|
|
elif line.startswith('- '):
|
|
# removed
|
|
diff_stats[section].remove_line_cnt += 1
|
|
continue
|
|
|
|
oghm = old_group_header.match(line)
|
|
if oghm:
|
|
section = 0
|
|
diff_stats[section].group_cnt += 1
|
|
continue
|
|
|
|
nghm = new_group_header.match(line)
|
|
if nghm:
|
|
section = 1
|
|
diff_stats[section].group_cnt += 1
|
|
continue
|
|
|
|
if line.startswith(group_separator):
|
|
in_group = True
|
|
continue
|
|
|
|
|
|
class ChangeReport:
|
|
"""Change report class for the diff statistics on 2 versions of a codebase.
|
|
|
|
Attributes:
|
|
old_dir: The old codebase dir path string.
|
|
new_dir: The new codebase dir path string.
|
|
dircmp: The dircmp object
|
|
group_cnt: How many diff groups.
|
|
add_line_cnt: How many lines are added.
|
|
remove_line_cnt: How many lines are removed.
|
|
replace_line_cnt: Hoe many lines are changed.
|
|
"""
|
|
|
|
def __init__(self, old_dir, new_dir, ignores=None, state_filter=None):
|
|
"""Initializes with old & new dir path strings."""
|
|
self.old_dir = os.path.abspath(old_dir)
|
|
self._old_dir_prefix_len = len(self.old_dir) + 1
|
|
self.new_dir = os.path.abspath(new_dir)
|
|
self._new_dir_prefix_len = len(self.new_dir) + 1
|
|
if ignores:
|
|
self._ignores = ignores.split(',')
|
|
self._ignores.extend(filecmp.DEFAULT_IGNORES)
|
|
else:
|
|
self._ignores = filecmp.DEFAULT_IGNORES
|
|
|
|
if state_filter:
|
|
self._state_filter = list(map(int, state_filter.split(',')))
|
|
else:
|
|
self._state_filter = [0, 1, 2, 3, 4]
|
|
|
|
self._do_same = DiffStat.SAME in self._state_filter
|
|
self._do_new = DiffStat.NEW in self._state_filter
|
|
self._do_removed = DiffStat.REMOVED in self._state_filter
|
|
self._do_moeified = DiffStat.MODIFIED in self._state_filter
|
|
self._do_incomparable = DiffStat.INCOMPARABLE in self._state_filter
|
|
|
|
self.dircmp = filecmp.dircmp(
|
|
self.old_dir, self.new_dir, ignore=self._ignores)
|
|
self._diff_stats = []
|
|
self._diff_stat_lines = []
|
|
self._diff_lines = []
|
|
self._processed_cnt = 0
|
|
self._common_dir_len = ChangeReport.get_common_path_len(
|
|
self.old_dir, self.new_dir)
|
|
|
|
@staticmethod
|
|
def get_common_path_len(dir1, dir2):
|
|
"""Gets the length of the common path of old & new folders."""
|
|
sep = os.path.sep
|
|
last_sep_pos = 0
|
|
for i in range(len(dir1)):
|
|
if dir1[i] == sep:
|
|
last_sep_pos = i
|
|
if dir1[i] != dir2[i]:
|
|
break
|
|
return last_sep_pos + 1
|
|
|
|
@staticmethod
|
|
def get_diff_stat_header():
|
|
"""Gets the diff statistic CSV header."""
|
|
return 'file,ext,text,state,{0},{1}\n'.format(
|
|
FileStat.get_csv_header('new'), FileStat.get_csv_header('old'))
|
|
|
|
def get_diff_stat_lines(self):
|
|
"""Gets the diff statistic CSV lines."""
|
|
if self._processed_cnt < 1:
|
|
self._process_dircmp(self.dircmp)
|
|
self._processed_cnt += 1
|
|
|
|
self._diff_stat_lines = []
|
|
for diff_stat in self._diff_stats:
|
|
self._diff_stat_lines.append('{0},{1},{2},{3},{4},{5}\n'.format(
|
|
FileStat.no_comma(diff_stat.name), diff_stat.ext,
|
|
diff_stat.file_type, diff_stat.state,
|
|
diff_stat.new_file_stat.get_csv_str(self._common_dir_len),
|
|
diff_stat.old_file_stat.get_csv_str(self._common_dir_len)))
|
|
|
|
return self._diff_stat_lines
|
|
|
|
def get_diff_lines(self):
|
|
"""Gets the diff output lines."""
|
|
if self._processed_cnt < 1:
|
|
self._process_dircmp(self.dircmp)
|
|
self._processed_cnt += 1
|
|
return self._diff_lines
|
|
|
|
def _process_dircmp(self, dircmp):
|
|
"""Compare all files in a dircmp object for diff statstics & output."""
|
|
if self._do_moeified:
|
|
self._process_diff_files(dircmp)
|
|
|
|
for subdir_dircmp in dircmp.subdirs.values():
|
|
rp = pathlib.Path(subdir_dircmp.right)
|
|
lp = pathlib.Path(subdir_dircmp.left)
|
|
if rp.is_symlink() or lp.is_symlink():
|
|
print('SKIP: symlink: {0} or {1}'.format(subdir_dircmp.right,
|
|
subdir_dircmp.left))
|
|
continue
|
|
self._process_dircmp(subdir_dircmp)
|
|
|
|
if self._do_new:
|
|
self._process_others(dircmp.right_only, dircmp.right,
|
|
self._new_dir_prefix_len, DiffStat.NEW)
|
|
if self._do_same:
|
|
self._process_others(dircmp.same_files, dircmp.right,
|
|
self._new_dir_prefix_len, DiffStat.SAME)
|
|
if self._do_incomparable:
|
|
self._process_others(dircmp.funny_files, dircmp.right,
|
|
self._new_dir_prefix_len, DiffStat.INCOMPARABLE)
|
|
if self._do_removed:
|
|
self._process_others(dircmp.left_only, dircmp.left,
|
|
self._old_dir_prefix_len, DiffStat.REMOVED)
|
|
|
|
def _process_others(self, files, adir, prefix_len, state):
|
|
"""Processes files are not modified."""
|
|
empty_stat = FileStat(None)
|
|
for file in files:
|
|
file_path = pathlib.Path(adir, file)
|
|
if file_path.is_symlink():
|
|
print('SKIP: symlink: {0}, {1}'.format(state, file_path))
|
|
continue
|
|
elif file_path.is_dir():
|
|
flist = self._get_filtered_files(file_path)
|
|
self._process_others(flist, adir, prefix_len, state)
|
|
else:
|
|
file_stat = FileStat(file_path)
|
|
common_name = str(file_path)[prefix_len:]
|
|
if state == DiffStat.REMOVED:
|
|
diff_stat = DiffStat(common_name, file_stat, empty_stat, state)
|
|
else:
|
|
diff_stat = DiffStat(common_name, empty_stat, file_stat, state)
|
|
try:
|
|
with open(file_path, encoding='utf-8') as f:
|
|
lines = f.readlines()
|
|
file_stat.line_cnt = len(lines)
|
|
file_type = FileStat.TEXT
|
|
except UnicodeDecodeError:
|
|
file_type = FileStat.NON_TEXT
|
|
|
|
diff_stat.file_type = file_type
|
|
self._diff_stats.append(diff_stat)
|
|
|
|
def _process_diff_files(self, dircmp):
|
|
"""Processes files are modified."""
|
|
for file in dircmp.diff_files:
|
|
old_file_path = pathlib.Path(dircmp.left, file)
|
|
new_file_path = pathlib.Path(dircmp.right, file)
|
|
self._diff_files(old_file_path, new_file_path)
|
|
|
|
def _diff_files(self, old_file_path, new_file_path):
|
|
"""Diff old & new files."""
|
|
old_file_stat = FileStat(old_file_path)
|
|
new_file_stat = FileStat(new_file_path)
|
|
common_name = str(new_file_path)[self._new_dir_prefix_len:]
|
|
diff_stat = DiffStat(common_name, old_file_stat, new_file_stat,
|
|
DiffStat.MODIFIED)
|
|
|
|
try:
|
|
with open(old_file_path, encoding='utf-8') as f1:
|
|
old_lines = f1.readlines()
|
|
old_file_stat.line_cnt = len(old_lines)
|
|
with open(new_file_path, encoding='utf-8') as f2:
|
|
new_lines = f2.readlines()
|
|
new_file_stat.line_cnt = len(new_lines)
|
|
diff_lines = list(
|
|
difflib.context_diff(old_lines, new_lines, old_file_path.name,
|
|
new_file_path.name))
|
|
file_type = FileStat.TEXT
|
|
if diff_lines:
|
|
self._diff_lines.extend(diff_lines)
|
|
diff_stat.add_diff_stat(diff_lines)
|
|
else:
|
|
print('WARNING: no diff lines on {0} {1}'.format(
|
|
old_file_path, new_file_path))
|
|
|
|
except UnicodeDecodeError:
|
|
file_type = FileStat.NON_TEXT
|
|
|
|
diff_stat.file_type = file_type
|
|
self._diff_stats.append(diff_stat)
|
|
|
|
def _get_filtered_files(self, dir_path):
|
|
"""Returns a filtered file list."""
|
|
flist = []
|
|
for f in dir_path.glob('*'):
|
|
if f.name not in self._ignores:
|
|
if f.is_symlink():
|
|
print('SKIP: symlink: %s' % f)
|
|
continue
|
|
else:
|
|
flist.append(f)
|
|
return flist
|
|
|
|
|
|
def write_file(file, lines, header=None):
|
|
"""Write lines into a file."""
|
|
|
|
with open(file, 'w') as f:
|
|
if header:
|
|
f.write(header)
|
|
|
|
f.writelines(lines)
|
|
print('OUTPUT: {0}, {1} lines'.format(file, len(lines)))
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
'Generate a diff stat cvs file for 2 versions of a codebase')
|
|
parser.add_argument('--old_dir', help='the old version codebase dir')
|
|
parser.add_argument('--new_dir', help='the new version codebase dir')
|
|
parser.add_argument(
|
|
'--csv_file', required=False, help='the diff stat cvs file if to create')
|
|
parser.add_argument(
|
|
'--diff_output_file',
|
|
required=False,
|
|
help='the diff output file if to create')
|
|
parser.add_argument(
|
|
'--ignores',
|
|
required=False,
|
|
default='.repo,.git,.github,.idea,__MACOSX,.prebuilt_info',
|
|
help='names to ignore')
|
|
parser.add_argument(
|
|
'--state_filter',
|
|
required=False,
|
|
default='1,2,3',
|
|
help='csv diff states to process, 0:SAME, 1:NEW, 2:REMOVED, 3:MODIFIED, '
|
|
'4:INCOMPARABLE')
|
|
|
|
args = parser.parse_args()
|
|
|
|
if not os.path.isdir(args.old_dir):
|
|
print('ERROR: %s does not exist.' % args.old_dir)
|
|
exit()
|
|
|
|
if not os.path.isdir(args.new_dir):
|
|
print('ERROR: %s does not exist.' % args.new_dir)
|
|
exit()
|
|
|
|
change_report = ChangeReport(args.old_dir, args.new_dir, args.ignores,
|
|
args.state_filter)
|
|
if args.csv_file:
|
|
write_file(
|
|
args.csv_file,
|
|
change_report.get_diff_stat_lines(),
|
|
header=ChangeReport.get_diff_stat_header())
|
|
|
|
if args.diff_output_file:
|
|
write_file(args.diff_output_file, change_report.get_diff_lines())
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|