# Licensed under a 3-clause BSD style license - see LICENSE.rst
import itertools
import math
import tabulate
from asv_runner.console import color_print
from asv_runner.statistics import get_err
from .. import _stats, util
from ..benchmarks import Benchmarks
from ..console import log
from ..environment import get_environments
from ..machine import iter_machine_files
from ..repo import NoSuchNameError, get_repo
from ..results import iter_results_for_machine_and_hash
from ..util import human_value, load_json
from . import Command, common_args
[docs]
def mean(values):
values = [value for value in values if value is not None]
if values:
return sum(values) / float(len(values))
else:
return None
[docs]
def unroll_result(benchmark_name, params, *values):
"""
Iterate through parameterized result values
Yields
------
name
Strings of the form "benchmark_name(value1, value2)" with
parameter values substituted in. For non-parameterized
results, simply the benchmark name.
value
Benchmark timing or other scalar value.
"""
num_comb = 1
for p in params:
num_comb *= len(p)
values = list(values)
for j in range(len(values)):
if values[j] is None:
values[j] = [None] * num_comb
for params, value in zip(itertools.product(*params), zip(*values)):
if params == ():
name = benchmark_name
else:
name = f"{benchmark_name}({', '.join(params)})"
yield (name,) + value
[docs]
def _isna(value):
# None (failed) or NaN (skipped)
return value is None or value != value
[docs]
def _is_result_better(a, b, a_ss, b_ss, factor, use_stats=True):
"""
Check if result 'a' is better than 'b' by the given factor,
possibly taking confidence intervals into account.
"""
if (
use_stats
and a_ss
and b_ss
and a_ss[0]
and b_ss[0]
and (a_ss[0].get('repeat', 0) != 1 and b_ss[0].get('repeat', 0) != 1)
):
# Return False if estimates don't differ.
#
# Special-case the situation with only one sample, in which
# case we do the comparison only based on `factor` as there's
# not enough data to do statistics.
if not _stats.is_different(a_ss[1], b_ss[1], a_ss[0], b_ss[0]):
return False
return a < b / factor
[docs]
class Compare(Command):
@classmethod
[docs]
def setup_arguments(cls, subparsers):
parser = subparsers.add_parser(
"compare",
help="""Compare the benchmark results between two revisions
(averaged over configurations)""",
description="Compare two sets of results",
)
parser.add_argument('revision1', help="""The reference revision.""")
parser.add_argument('revision2', help="""The revision being compared.""")
common_args.add_compare(parser, sort_default='default', only_changed_default=False)
parser.add_argument(
'--machine',
'-m',
type=str,
default=None,
help="""The machine to compare the revisions for.""",
)
common_args.add_environment(parser)
parser.set_defaults(func=cls.run_from_args)
return parser
@classmethod
[docs]
def run_from_conf_args(cls, conf, args):
return cls.run(
conf=conf,
hash_1=args.revision1,
hash_2=args.revision2,
factor=args.factor,
split=args.split,
only_changed=args.only_changed,
sort=args.sort,
machine=args.machine,
env_spec=args.env_spec,
use_stats=args.use_stats,
)
@classmethod
[docs]
def run(
cls,
conf,
hash_1,
hash_2,
factor=None,
split=False,
only_changed=False,
sort='default',
machine=None,
env_spec=None,
use_stats=True,
):
repo = get_repo(conf)
try:
hash_1 = repo.get_hash_from_name(hash_1)
except NoSuchNameError:
pass
try:
hash_2 = repo.get_hash_from_name(hash_2)
except NoSuchNameError:
pass
if env_spec:
env_names = [
env.name for env in get_environments(conf, env_spec, verbose=False)
] + list(env_spec)
else:
env_names = None
machines = []
for path in iter_machine_files(conf.results_dir):
d = load_json(path)
machines.append(d['machine'])
if len(machines) == 0:
raise util.UserError("No results found")
elif machine is None:
if len(machines) > 1:
raise util.UserError(
"Results available for several machines: {} - "
"specify which one to use with the --machine option".format('/'.join(machines))
)
else:
machine = machines[0]
elif machine not in machines:
raise util.UserError(f"Results for machine '{machine} not found")
commit_names = {
hash_1: repo.get_name_from_hash(hash_1),
hash_2: repo.get_name_from_hash(hash_2),
}
cls.print_table(
conf,
hash_1,
hash_2,
factor=factor,
split=split,
use_stats=use_stats,
only_changed=only_changed,
sort=sort,
machine=machine,
env_names=env_names,
commit_names=commit_names,
)
@classmethod
[docs]
def print_table(
cls,
conf,
hash_1,
hash_2,
factor,
split,
resultset_1=None,
resultset_2=None,
machine=None,
only_changed=False,
sort='default',
use_stats=True,
env_names=None,
commit_names=None,
):
results_1 = {}
results_2 = {}
ss_1 = {}
ss_2 = {}
versions_1 = {}
versions_2 = {}
units = {}
benchmarks = Benchmarks.load(conf)
if commit_names is None:
commit_names = {}
def results_default_iter(commit_hash):
for result in iter_results_for_machine_and_hash(
conf.results_dir, machine, commit_hash
):
if env_names is not None and result.env_name not in env_names:
continue
for key in result.get_all_result_keys():
params = result.get_result_params(key)
result_value = result.get_result_value(key, params)
result_stats = result.get_result_stats(key, params)
result_samples = result.get_result_samples(key, params)
result_version = result.benchmark_version.get(key)
yield (
key,
params,
result_value,
result_stats,
result_samples,
result_version,
result.params['machine'],
result.env_name,
)
if resultset_1 is None:
resultset_1 = results_default_iter(hash_1)
if resultset_2 is None:
resultset_2 = results_default_iter(hash_2)
machine_env_names = set()
for key, params, value, stats, samples, version, machine, env_name in resultset_1:
machine_env_name = f"{machine}/{env_name}"
machine_env_names.add(machine_env_name)
for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
results_1[(name, machine_env_name)] = value
ss_1[(name, machine_env_name)] = (stats, samples)
versions_1[(name, machine_env_name)] = version
for key, params, value, stats, samples, version, machine, env_name in resultset_2:
machine_env_name = f"{machine}/{env_name}"
machine_env_names.add(machine_env_name)
for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
results_2[(name, machine_env_name)] = value
ss_2[(name, machine_env_name)] = (stats, samples)
versions_2[(name, machine_env_name)] = version
if len(results_1) == 0:
raise util.UserError(f"Did not find results for commit {hash_1}")
if len(results_2) == 0:
raise util.UserError(f"Did not find results for commit {hash_2}")
benchmarks_1 = set(results_1.keys())
benchmarks_2 = set(results_2.keys())
joint_benchmarks = sorted(benchmarks_1 | benchmarks_2)
bench = {}
if split:
bench['green'] = []
bench['red'] = []
bench['lightgrey'] = []
bench['default'] = []
else:
bench['all'] = []
worsened = False
improved = False
for benchmark in joint_benchmarks:
if benchmark in results_1:
time_1 = results_1[benchmark]
else:
time_1 = math.nan
if benchmark in results_2:
time_2 = results_2[benchmark]
else:
time_2 = math.nan
if benchmark in ss_1 and ss_1[benchmark][0]:
err_1 = get_err(time_1, ss_1[benchmark][0])
else:
err_1 = None
if benchmark in ss_2 and ss_2[benchmark][0]:
err_2 = get_err(time_2, ss_2[benchmark][0])
else:
err_2 = None
version_1 = versions_1.get(benchmark)
version_2 = versions_2.get(benchmark)
if _isna(time_1) or _isna(time_2):
ratio = 'n/a'
ratio_num = 1e9
else:
try:
ratio_num = time_2 / time_1
ratio = f"{ratio_num:6.2f}"
except ZeroDivisionError:
ratio_num = 1e9
ratio = "n/a"
if version_1 is not None and version_2 is not None and version_1 != version_2:
# not comparable
color = 'lightgrey'
mark = 'x'
elif time_1 is not None and time_2 is None:
# introduced a failure
color = 'red'
mark = '!'
worsened = True
elif time_1 is None and time_2 is not None:
# fixed a failure
color = 'green'
mark = '*'
improved = True
elif time_1 is None and time_2 is None:
# both failed
color = 'default'
mark = ' '
elif _isna(time_1) or _isna(time_2):
# either one was skipped
color = 'default'
mark = ' '
elif _is_result_better(
time_2,
time_1,
ss_2.get(benchmark),
ss_1.get(benchmark),
factor,
use_stats=use_stats,
):
color = 'green'
mark = '-'
improved = True
elif _is_result_better(
time_1,
time_2,
ss_1.get(benchmark),
ss_2.get(benchmark),
factor,
use_stats=use_stats,
):
color = 'red'
mark = '+'
worsened = True
else:
color = 'default'
mark = ' '
# Mark statistically insignificant results
if _is_result_better(time_1, time_2, None, None, factor) or _is_result_better(
time_2, time_1, None, None, factor
):
ratio = "~" + ratio.strip()
if only_changed and mark in (' ', 'x', '*'):
continue
unit = units[benchmark]
details = f"{mark:1s} {human_value(time_1, unit, err=err_1):>15s} {human_value(time_2, unit, err=err_2):>15s} {ratio:>8s} "
split_line = details.split()
if len(machine_env_names) > 1:
benchmark_name = "{} [{}]".format(*benchmark)
else:
benchmark_name = benchmark[0]
if len(split_line) == 4:
split_line += [benchmark_name]
else:
split_line = [' '] + split_line + [benchmark_name]
if split:
bench[color].append(split_line)
else:
bench['all'].append(split_line)
if split:
keys = ['green', 'default', 'red', 'lightgrey']
else:
keys = ['all']
titles = {}
titles['green'] = "Benchmarks that have improved:"
titles['default'] = "Benchmarks that have stayed the same:"
titles['red'] = "Benchmarks that have got worse:"
titles['lightgrey'] = "Benchmarks that are not comparable:"
titles['all'] = "All benchmarks:"
log.flush()
for key in keys:
if len(bench[key]) == 0:
continue
if not only_changed:
color_print("")
color_print(titles[key])
color_print("")
name_1 = commit_names.get(hash_1)
if name_1:
name_1 = f'<{name_1}>'
else:
name_1 = ''
name_2 = commit_names.get(hash_2)
if name_2:
name_2 = f'<{name_2}>'
else:
name_2 = ''
if sort == 'default':
pass
elif sort == 'ratio':
bench[key].sort(key=lambda v: v[3], reverse=True)
elif sort == 'name':
bench[key].sort(key=lambda v: v[2])
else:
raise ValueError("Unknown 'sort'")
print(
tabulate.tabulate(
bench[key],
headers=[
'Change',
f'Before [{hash_1[:8]}] {name_1}',
f'After [{hash_2[:8]}] {name_2}',
'Ratio',
'Benchmark (Parameter)',
],
tablefmt="github",
)
)
return worsened, improved