# Copyright 2020 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Methods related to querying the ResultDB BigQuery tables."""

import json
import logging
import math
import multiprocessing.pool
import os
import subprocess
import threading
import time
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import six

from typ import expectations_parser
from typ import json_results
from unexpected_passes_common import builders as builders_module
from unexpected_passes_common import constants
from unexpected_passes_common import data_types
from unexpected_passes_common import expectations
from unexpected_passes_common import multiprocessing_utils

DEFAULT_NUM_SAMPLES = 100
MAX_ROWS = (2**31) - 1
MAX_QUERY_TRIES = 3
# Used to prevent us from triggering too many queries simultaneously and
# causing a bunch of rate limit errors. Anything below 1.5 seemed to result in
# enough rate limit errors to cause problems. Raising above that for safety.
QUERY_DELAY = 2
# The target number of results/rows per query when running in large query
# mode. Higher values = longer individual query times and higher chances of
# running out of memory in BigQuery. Lower values = more parallelization
# overhead and more issues with rate limit errors.
TARGET_RESULTS_PER_QUERY = 20000

# Subquery for getting all try builds that were used for CL submission. 30
# days is chosen because the ResultDB tables we pull data from only keep data
# around for 30 days.
SUBMITTED_BUILDS_TEMPLATE = """\
SELECT
  CONCAT("build-", CAST(unnested_builds.id AS STRING)) as id
FROM
  `commit-queue.{project_view}.attempts`,
  UNNEST(builds) as unnested_builds,
  UNNEST(gerrit_changes) as unnested_changes
WHERE
  unnested_builds.host = "cr-buildbucket.appspot.com"
  AND unnested_changes.submit_status = "SUCCESS"
  AND start_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(),
                                 INTERVAL 30 DAY)"""
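
# The {project_view} placeholder is presumably filled in by callers via
# str.format, e.g. (project view name hypothetical):
#   SUBMITTED_BUILDS_TEMPLATE.format(project_view='chromium')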

QueryResult = Dict[str, Any]
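# Maps a parameter type to name/value pairs, e.g. {'INT64': {'num_builds': 5}};
# see GenerateBigQueryCommand for how these become --parameter flags.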
QueryParameters = Dict[str, Dict[str, Any]]

# pylint: disable=super-with-arguments,useless-object-inheritance


class BigQueryQuerier(object):
  """Class to handle all BigQuery queries for a script invocation."""

  def __init__(self, suite: Optional[str], project: str, num_samples: int,
               large_query_mode: bool, num_jobs: Optional[int]):
    """
    Args:
      suite: A string containing the name of the suite that is being queried
          for. Can be None if there is no differentiation between different
          suites.
      project: A string containing the billing project to use for BigQuery.
      num_samples: An integer containing the number of builds to pull results
          from.
      large_query_mode: A boolean indicating whether large query mode should
          be used. In this mode, an initial, smaller query is made and its
          results are used to perform additional filtering on a second, larger
          query in BigQuery. This works around hitting a hard memory limit
          when running the ORDER BY clause.
      num_jobs: An integer specifying how many jobs to run in parallel. If
          None, all jobs will be run in parallel at the same time.
    """
    self._suite = suite
    self._project = project
    self._num_samples = num_samples or DEFAULT_NUM_SAMPLES
    self._large_query_mode = large_query_mode
    self._num_jobs = num_jobs

    assert self._num_samples > 0
    assert (self._num_jobs is None or self._num_jobs > 0)

  def FillExpectationMapForBuilders(
      self, expectation_map: data_types.TestExpectationMap,
      builders: Iterable[data_types.BuilderEntry]
  ) -> Dict[str, data_types.ResultListType]:
    """Fills |expectation_map| with results from |builders|.

    Args:
      expectation_map: A data_types.TestExpectationMap. Will be modified
          in-place.
      builders: An iterable of data_types.BuilderEntry containing the builders
          to query.

    Returns:
      A dict containing any results that were retrieved that did not have a
      matching expectation in |expectation_map| in the following format:
      {
        |builder_type|:|builder_name| (str): [
          result1 (data_types.Result),
          result2 (data_types.Result),
          ...
        ],
      }
    """
    assert isinstance(expectation_map, data_types.TestExpectationMap)
    # Ensure that all the builders are of the same type since we make some
    # assumptions about that later on.
    assert builders
    builder_type = None
    for b in builders:
      if builder_type is None:
        builder_type = b.builder_type
      else:
        assert b.builder_type == builder_type

    # Filter out any builders that we can easily determine do not currently
    # produce data we care about.
    builders = self._FilterOutInactiveBuilders(builders, builder_type)

    # If we don't have an explicit number of jobs set, spin up a separate
    # process for each query/add step. This is wasteful in the sense that
    # we'll have a bunch of idle processes once faster steps start finishing,
    # but ensures that we start slow queries early and avoids the overhead of
    # passing large amounts of data between processes. See crbug.com/1182459
    # for more information on performance considerations.
    num_jobs = self._num_jobs or len(builders)
    process_pool = multiprocessing_utils.GetProcessPool(nodes=num_jobs)

    args = [(b, expectation_map) for b in builders]

    results = process_pool.map(self._QueryAddCombined, args)

    tmp_expectation_map = data_types.TestExpectationMap()
    all_unmatched_results = {}

    for (unmatched_results, prefixed_builder_name, merge_map) in results:
      tmp_expectation_map.Merge(merge_map, expectation_map)
      if unmatched_results:
        all_unmatched_results[prefixed_builder_name] = unmatched_results

    expectation_map.clear()
    expectation_map.update(tmp_expectation_map)

    return all_unmatched_results

  def _FilterOutInactiveBuilders(self,
                                 builders: Iterable[data_types.BuilderEntry],
                                 builder_type: str
                                 ) -> List[data_types.BuilderEntry]:
    """Filters out any builders that are not producing data.

    This helps save time on querying, as querying for the builder names is
    cheap while querying for individual results from a builder is expensive.
    Filtering out inactive builders lets us preemptively remove builders that
    we know we won't get any data from, and thus don't need to waste time
    querying.

    Args:
      builders: An iterable of data_types.BuilderEntry containing the builders
          to query.
      builder_type: A string containing the type of builder to query, either
          "ci" or "try".

    Returns:
      A copy of |builders| with any inactive builders removed.
    """
    include_internal_builders = any(b.is_internal_builder for b in builders)
    query = self._GetActiveBuilderQuery(
        builder_type, include_internal_builders).encode('utf-8')
    cmd = GenerateBigQueryCommand(self._project, {}, batch=False)
    with open(os.devnull, 'w') as devnull:
      p = subprocess.Popen(cmd,
                           stdout=subprocess.PIPE,
                           stderr=devnull,
                           stdin=subprocess.PIPE)
      stdout, _ = p.communicate(query)
    if not isinstance(stdout, six.string_types):
      stdout = stdout.decode('utf-8')
    results = json.loads(stdout)

    # We filter from an initial list instead of directly using the returned
    # builders since there are cases where they aren't equivalent, such as for
    # GPU tests if a particular builder doesn't run a particular suite. This
    # could be encapsulated in the query, but would cause the query to take
    # longer. Since generating the initial list locally is basically
    # instantaneous and we're optimizing for runtime, filtering is the better
    # option.
    active_builders = {r['builder_name'] for r in results}
    filtered_builders = [b for b in builders if b.name in active_builders]
    return filtered_builders

  def _QueryAddCombined(
      self,
      inputs: Tuple[data_types.BuilderEntry, data_types.TestExpectationMap]
  ) -> Tuple[data_types.ResultListType, str, data_types.TestExpectationMap]:
    """Combines the query and add steps for use in a process pool.

    Args:
      inputs: An iterable of inputs for QueryBuilder() and
          data_types.TestExpectationMap.AddResultList(). Should be in the
          order: builder, expectation_map.

    Returns:
      The output of data_types.TestExpectationMap.AddResultList().
    """
    builder, expectation_map = inputs
    results, expectation_files = self.QueryBuilder(builder)

    prefixed_builder_name = '%s/%s:%s' % (builder.project, builder.builder_type,
                                          builder.name)
    unmatched_results = expectation_map.AddResultList(prefixed_builder_name,
                                                      results,
                                                      expectation_files)

    return unmatched_results, prefixed_builder_name, expectation_map

  def QueryBuilder(self, builder: data_types.BuilderEntry
                   ) -> Tuple[data_types.ResultListType, Optional[List[str]]]:
    """Queries ResultDB for results from |builder|.

    Args:
      builder: A data_types.BuilderEntry containing the builder to query.

    Returns:
      A tuple (results, expectation_files). |results| is the results returned
      by the query converted into a list of data_types.Result objects.
      |expectation_files| is a list of strings denoting which expectation
      files are relevant to |results|, or None if all should be used.
    """

    query_generator = self._GetQueryGeneratorForBuilder(builder)
    if not query_generator:
      # No affected tests on this builder, so early return.
      return [], None

    # Query for the test data from the builder, splitting the query if we run
    # into the BigQuery hard memory limit. Even if we keep failing, this will
    # eventually stop due to getting a QuerySplitError when we can't split
    # the query any further.
    query_results = None
    while query_results is None:
      try:
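        # The empty-string parameter type below makes the bq CLI treat
        # builder_name as a STRING parameter (STRING is bq's default when no
        # type is given).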
        query_results = self._RunBigQueryCommandsForJsonOutput(
            query_generator.GetQueries(), {
                '': {
                    'builder_name': builder.name
                },
                'INT64': {
                    'num_builds': self._num_samples
                }
            })
      except MemoryLimitError:
        logging.warning(
            'Query to builder %s hit BigQuery hard memory limit, trying again '
            'with more query splitting.', builder.name)
        query_generator.SplitQuery()

    results = []
    if not query_results:
      # Don't bother logging if we know this is a fake CI builder.
      if not (builder.builder_type == constants.BuilderTypes.CI
              and builder in builders_module.GetInstance().GetFakeCiBuilders()):
        logging.warning(
            'Did not get results for "%s", but this may be because its '
            'results do not apply to any expectations for this suite.',
            builder.name)
      return results, None

    # It's possible that a builder runs multiple versions of a test with
    # different expectation files for each version. So, find a result for each
    # unique step and get the expectation files from all of them.
    results_for_each_step = {}
    for qr in query_results:
      step_name = qr['step_name']
      if step_name not in results_for_each_step:
        results_for_each_step[step_name] = qr

    expectation_files = []
    for qr in results_for_each_step.values():
      # None is a special value indicating "use all expectation files", so
      # handle that.
      ef = self._GetRelevantExpectationFilesForQueryResult(qr)
      if ef is None:
        expectation_files = None
        break
      expectation_files.extend(ef)
    if expectation_files is not None:
      expectation_files = list(set(expectation_files))

    for r in query_results:
      if self._ShouldSkipOverResult(r):
        continue
      results.append(self._ConvertJsonResultToResultObject(r))
    logging.debug('Got %d results for %s builder %s', len(results),
                  builder.builder_type, builder.name)
    return results, expectation_files

  def _ConvertJsonResultToResultObject(self, json_result: QueryResult
                                       ) -> data_types.Result:
    """Converts a single BigQuery JSON result to a data_types.Result.

    Args:
      json_result: A single row/result from BigQuery in JSON format.

    Returns:
      A data_types.Result object containing the information from
      |json_result|.
    """
    build_id = _StripPrefixFromBuildId(json_result['id'])
    test_name = self._StripPrefixFromTestId(json_result['test_id'])
    actual_result = _ConvertActualResultToExpectationFileFormat(
        json_result['status'])
    tags = expectations.GetInstance().FilterToKnownTags(json_result['typ_tags'])
    step = json_result['step_name']
    return data_types.Result(test_name, tags, actual_result, step, build_id)

  def _GetRelevantExpectationFilesForQueryResult(self, query_result: QueryResult
                                                 ) -> Optional[Iterable[str]]:
    """Gets the relevant expectation file names for a given query result.

    Args:
      query_result: A dict containing a single row/result from a BigQuery
          query.

    Returns:
      An iterable of strings containing expectation file names that are
      relevant to |query_result|, or None if all expectation files should be
      considered relevant.
    """
    raise NotImplementedError()

  def _ShouldSkipOverResult(self, result: QueryResult) -> bool:
    """Whether |result| should be ignored and skipped over.

    Args:
      result: A dict containing a single BigQuery result row.

    Returns:
      True if the result should be skipped over/ignored, otherwise False.
    """
    del result
    return False

  def _GetQueryGeneratorForBuilder(self, builder: data_types.BuilderEntry
                                   ) -> Optional['BaseQueryGenerator']:
    """Returns a BaseQueryGenerator instance to only include relevant tests.

    Args:
      builder: A data_types.BuilderEntry containing the builder to query.

    Returns:
      None if the query returned no results. Otherwise, some instance of a
      BaseQueryGenerator.
    """
    raise NotImplementedError()

  def _RunBigQueryCommandsForJsonOutput(self, queries: Union[str, List[str]],
                                        parameters: QueryParameters
                                        ) -> List[QueryResult]:
    """Runs the given BigQuery queries and returns their outputs as JSON.

    Args:
      queries: A list of strings containing valid BigQuery queries to run, or
          a single such string.
      parameters: A dict specifying parameters to substitute in the query in
          the format {type: {key: value}}. For example, the dict:
          {'INT64': {'num_builds': 5}}
          would result in --parameter=num_builds:INT64:5 being passed to
          BigQuery.

    Returns:
      The combined results of |queries| in JSON.
    """
    if isinstance(queries, str):
      queries = [queries]
    assert isinstance(queries, list)

    processes = set()
    processes_lock = threading.Lock()

    def run_cmd_in_thread(inputs: Tuple[List[str], str]) -> str:
      cmd, query = inputs
      query = query.encode('utf-8')
      with open(os.devnull, 'w') as devnull:
        with processes_lock:
          # Starting many queries at once causes us to hit rate limits much
          # more frequently, so stagger query starts to help avoid that.
          time.sleep(QUERY_DELAY)
          p = subprocess.Popen(cmd,
                               stdout=subprocess.PIPE,
                               stderr=devnull,
                               stdin=subprocess.PIPE)
          processes.add(p)

        # We pass in the query via stdin instead of including it on the
        # commandline because we can run into command length issues in large
        # query mode.
        stdout, _ = p.communicate(query)
      if not isinstance(stdout, six.string_types):
        stdout = stdout.decode('utf-8')
      if p.returncode:
        # When running many queries in parallel, it's possible to hit the
        # rate limit for the account if we're unlucky, so try again if we do.
        if 'Exceeded rate limits' in stdout:
          raise RateLimitError()
        error_msg = 'Error running command %s. stdout: %s' % (cmd, stdout)
        if 'memory' in stdout:
          raise MemoryLimitError(error_msg)
        raise RuntimeError(error_msg)
      return stdout

    def run_cmd(cmd: List[str], tries: int) -> List[str]:
      if tries >= MAX_QUERY_TRIES:
        raise RuntimeError('Query failed too many times, aborting')

      # We use a thread pool with a thread for each query/process instead of
      # just creating the processes due to guidance from the Python docs:
      # https://docs.python.org/3/library/subprocess.html#subprocess.Popen.stderr
      # We need to write to stdin to pass the query in, but using
      # stdout/stderr/stdin directly is discouraged due to the potential for
      # deadlocks. The suggested method (using .communicate()) blocks, so we
      # need the thread pool to maintain parallelism.
      pool = multiprocessing.pool.ThreadPool(len(queries))

      def cleanup():
        pool.terminate()
        for p in processes:
          try:
            p.terminate()
          except OSError:
            # We can fail to terminate if the process is already finished, so
            # ignore such failures.
            pass
        processes.clear()

      args = [(cmd, q) for q in queries]
      try:
        return pool.map(run_cmd_in_thread, args)
      except RateLimitError:
        logging.warning('Query hit rate limit, retrying')
        cleanup()
        return run_cmd(cmd, tries + 1)
      finally:
        cleanup()
      raise RuntimeError('Hit branch that should be unreachable')

    bq_cmd = GenerateBigQueryCommand(self._project, parameters)
    stdouts = run_cmd(bq_cmd, 0)
    combined_json = []
    for result in [json.loads(s) for s in stdouts]:
      for row in result:
        combined_json.append(row)
    return combined_json

  def _StripPrefixFromTestId(self, test_id: str) -> str:
    """Strips the prefix from a test ID, leaving only the test case name.

    Args:
      test_id: A string containing a full ResultDB test ID, e.g.
          ninja://target/directory.suite.class.test_case

    Returns:
      A string containing the test case name extracted from |test_id|.
    """
    raise NotImplementedError()

  def _GetActiveBuilderQuery(self, builder_type: str,
                             include_internal_builders: bool) -> str:
    """Gets the SQL query for determining which builders actually produce data.

    Args:
      builder_type: A string containing the type of builders to query, either
          "ci" or "try".
      include_internal_builders: A boolean indicating whether internal builders
          should be included in the data that the query will access.

    Returns:
      A string containing a SQL query that will get the names of all relevant
      builders that are active/producing data.
    """
    raise NotImplementedError()


class BaseQueryGenerator(object):
  """Abstract base class for query generators."""

  def __init__(self, builder: data_types.BuilderEntry):
    self._builder = builder

  def SplitQuery(self) -> None:
    """Splits the query into more clauses/queries."""
    raise NotImplementedError('SplitQuery must be overridden in a child class')

  def GetClauses(self) -> List[str]:
    """Gets string representations of the test filters.

    Returns:
      A list of strings, each string being a valid SQL clause that applies a
      portion of the test filter to a query.
    """
    raise NotImplementedError('GetClauses must be overridden in a child class')

  def GetQueries(self) -> List[str]:
    """Gets string representations of the queries to run.

    Returns:
      A list of strings, each string being a valid SQL query that queries a
      portion of the tests of interest.
    """
    raise NotImplementedError('GetQueries must be overridden in a child class')


# pylint: disable=abstract-method
class FixedQueryGenerator(BaseQueryGenerator):
  """Concrete test filter that cannot be split."""

  def __init__(self, builder: data_types.BuilderEntry, test_filter: str):
    """
    Args:
      test_filter: A string containing the test filter SQL clause to use.
    """
    super(FixedQueryGenerator, self).__init__(builder)
    self._test_filter = test_filter

  def SplitQuery(self) -> None:
    raise QuerySplitError('Tried to split a query without any test IDs to use, '
                          'use --large-query-mode')

  def GetClauses(self) -> List[str]:
    return [self._test_filter]
# pylint: enable=abstract-method


# pylint: disable=abstract-method
class SplitQueryGenerator(BaseQueryGenerator):
  """Concrete test filter that can be split to a desired size."""

  def __init__(self, builder: data_types.BuilderEntry, test_ids: List[str],
               target_num_samples: int):
    """
    Args:
      test_ids: A list of strings containing the test IDs to use in the test
          filter.
      target_num_samples: The target/max number of samples to get from each
          query that uses clauses from this test filter.
    """
    super(SplitQueryGenerator, self).__init__(builder)
    self._test_id_lists = []
    self._target_num_samples = target_num_samples
    self._clauses = []
    self._PerformInitialSplit(test_ids)

  def _PerformInitialSplit(self, test_ids: List[str]) -> None:
    """Evenly splits |test_ids| into lists that are ~|_target_num_samples| long

    Only to be called from the constructor.

    Args:
      test_ids: A list of test IDs to split and assign to the _test_id_lists
          member.
    """
    assert isinstance(test_ids[0], six.string_types)

    num_lists = int(math.ceil(float(len(test_ids)) / self._target_num_samples))
    list_size = int(math.ceil(float(len(test_ids)) / num_lists))

    split_lists = []
    start = 0
    for _ in range(num_lists):
      end = min(len(test_ids), start + list_size)
      split_lists.append(test_ids[start:end])
      start = end
    self._test_id_lists = split_lists
    self._GenerateClauses()

  def _GenerateClauses(self) -> None:
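    # Each clause narrows a query to one slice of test IDs, producing SQL like
    # (assuming the stored IDs are already quoted strings):
    #   AND test_id IN UNNEST(["ninja://foo.test_a", "ninja://bar.test_b"])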
    test_filter_clauses = []
    for id_list in self._test_id_lists:
      clause = 'AND test_id IN UNNEST([%s])' % ', '.join(id_list)
      test_filter_clauses.append(clause)
    self._clauses = test_filter_clauses

  def SplitQuery(self) -> None:
    def _SplitListInHalf(l: list) -> Tuple[list, list]:
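      # e.g. [1, 2, 3] -> ([1], [2, 3]).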
      assert len(l) > 1
      front = l[:len(l) // 2]
      back = l[len(l) // 2:]
      return front, back

    tmp_test_id_lists = []
    for til in self._test_id_lists:
      if len(til) <= 1:
        raise QuerySplitError(
            'Cannot split query any further, try lowering --num-samples')
      front, back = _SplitListInHalf(til)
      tmp_test_id_lists.append(front)
      tmp_test_id_lists.append(back)
    self._test_id_lists = tmp_test_id_lists
    self._GenerateClauses()

  def GetClauses(self) -> List[str]:
    return self._clauses
# pylint: enable=abstract-method


def GenerateBigQueryCommand(project: str,
                            parameters: QueryParameters,
                            batch: bool = True) -> List[str]:
  """Generate a BigQuery commandline.

  Does not contain the actual query, as that is passed in via stdin.

  Args:
    project: A string containing the billing project to use for BigQuery.
    parameters: A dict specifying parameters to substitute in the query in
        the format {type: {key: value}}. For example, the dict:
        {'INT64': {'num_builds': 5}}
        would result in --parameter=num_builds:INT64:5 being passed to
        BigQuery.
    batch: Whether to run the query in batch mode or not. Batching adds some
        random amount of overhead since it means the query has to wait for
        idle resources, but also allows for much better parallelism.

  Returns:
    A list containing the BigQuery commandline, suitable to be passed to a
    method from the subprocess module.
  """
  cmd = [
      'bq',
      'query',
      '--max_rows=%d' % MAX_ROWS,
      '--format=json',
      '--project_id=%s' % project,
      '--use_legacy_sql=false',
  ]

  if batch:
    cmd.append('--batch')

  for parameter_type, parameter_pairs in parameters.items():
    for k, v in parameter_pairs.items():
      cmd.append('--parameter=%s:%s:%s' % (k, parameter_type, v))
  return cmd


def _StripPrefixFromBuildId(build_id: str) -> str:
  # Build IDs provided by ResultDB are prefixed with "build-"
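  # e.g. _StripPrefixFromBuildId('build-1234') returns '1234'.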
  split_id = build_id.split('-')
  assert len(split_id) == 2
  return split_id[-1]


def _ConvertActualResultToExpectationFileFormat(actual_result: str) -> str:
  # Web tests use ResultDB's ABORT value for both test timeouts and device
  # failures, but Abort is not defined in typ. So, map it to timeout now.
  if actual_result == 'ABORT':
    actual_result = json_results.ResultType.Timeout
  # The result reported to ResultDB is in the format PASS/FAIL, while the
  # expected results in an expectation file are in the format Pass/Failure.
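  # e.g. typ's RESULT_TAGS maps 'PASS' -> 'Pass' and 'FAIL' -> 'Failure'.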
  return expectations_parser.RESULT_TAGS[actual_result]


class RateLimitError(Exception):
  """Exception raised when BigQuery hits a rate limit error."""


class MemoryLimitError(Exception):
  """Exception raised when BigQuery hits its hard memory limit."""


class QuerySplitError(Exception):
  """Exception raised when a query cannot be split any further."""