unplugged-system/external/cronet/base/substring_set_matcher/substring_set_matcher.h

// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
#define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_

#include <stdint.h>

#include <limits>
#include <set>
#include <string>
#include <vector>

#include "base/base_export.h"
#include "base/check_op.h"
#include "base/memory/raw_ptr_exclusion.h"
#include "base/substring_set_matcher/matcher_string_pattern.h"

namespace base {

// Class that store a set of string patterns and can find for a string S,
// which string patterns occur in S.
class BASE_EXPORT SubstringSetMatcher {
 public:
  SubstringSetMatcher();
  SubstringSetMatcher(const SubstringSetMatcher&) = delete;
  SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;

  ~SubstringSetMatcher();

  // Registers all |patterns|. Each pattern needs to have a unique ID and all
  // pattern strings must be unique. Build() should be called exactly once
  // (before it is called, the tree is empty).
  //
  // Complexity:
  //    Let n = number of patterns.
  //    Let S = sum of pattern lengths.
  //    Let k = range of char. Generally 256.
  // Complexity = O(nlogn + S * logk)
  // nlogn comes from sorting the patterns.
  // log(k) comes from our usage of std::map to store edges.
  //
  // Returns true on success (may fail if e.g. if the tree gets too many nodes).
  bool Build(const std::vector<MatcherStringPattern>& patterns);
  bool Build(std::vector<const MatcherStringPattern*> patterns);

  // Matches |text| against all registered MatcherStringPatterns. Stores the IDs
  // of matching patterns in |matches|. |matches| is not cleared before adding
  // to it.
  // Complexity:
  //    Let t = length of |text|.
  //    Let k = range of char. Generally 256.
  //    Let z = number of matches returned.
  // Complexity = O(t * logk + zlogz)
  bool Match(const std::string& text,
             std::set<MatcherStringPattern::ID>* matches) const;

  // As Match(), except it returns immediately on the first match.
  // This allows true/false matching to be done without any dynamic
  // memory allocation.
  // Complexity = O(t * logk)
  bool AnyMatch(const std::string& text) const;

  // Returns true if this object retains no allocated data.
  bool IsEmpty() const { return is_empty_; }

  // Returns the dynamically allocated memory usage in bytes. See
  // base/trace_event/memory_usage_estimator.h for details.
  size_t EstimateMemoryUsage() const;

 private:
  // Represents the index of the node within |tree_|. It is specifically
  // uint32_t so that we can be sure it takes up 4 bytes when stored together
  // with the 9-bit label (so 23 bits are allocated to the NodeID, even though
  // it is exposed as uint32_t). If the computed size of |tree_| is
  // larger than what can be stored within 23 bits, Build() will fail.
  using NodeID = uint32_t;

  // This is the maximum possible size of |tree_| and hence can't be a valid ID.
  static constexpr NodeID kInvalidNodeID = (1u << 23) - 1;

  static constexpr NodeID kRootID = 0;

  // A node of an Aho Corasick Tree. See
  // http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
  // to understand the algorithm.
  //
  // The algorithm is based on the idea of building a trie of all registered
  // patterns. Each node of the tree is annotated with a set of pattern
  // IDs that are used to report matches.
  //
  // The root of the trie represents an empty match. If we were looking whether
  // any registered pattern matches a text at the beginning of the text (i.e.
  // whether any pattern is a prefix of the text), we could just follow
  // nodes in the trie according to the matching characters in the text.
  // E.g., if text == "foobar", we would follow the trie from the root node
  // to its child labeled 'f', from there to child 'o', etc. In this process we
  // would report all pattern IDs associated with the trie nodes as matches.
  //
  // As we are not looking for all prefix matches but all substring matches,
  // this algorithm would need to compare text.substr(0), text.substr(1), ...
  // against the trie, which is in O(|text|^2).
  //
  // The Aho Corasick algorithm improves this runtime by using failure edges.
  // In case we have found a partial match of length k in the text
  // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
  // a node at depth k, but cannot find a match in the trie for character
  // text[i + k] at depth k + 1, we follow a failure edge. This edge
  // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
  // is a prefix of any registered pattern.
  //
  // If your brain thinks "Forget it, let's go shopping.", don't worry.
  // Take a nap and read an introductory text on the Aho Corasick algorithm.
  // It will make sense. Eventually.

  // An edge internal to the tree. We pack the label (character we are
  // matching on) and the destination node ID into 32 bits, to save memory.
  // We also use these edges as a sort of generic key/value store for
  // some special values that not all nodes will have; this also saves on
  // memory over the otherwise obvious choice of having them as struct fields,
  // as it means we do not to store them when they are not present.
  struct AhoCorasickEdge {
    // char (unsigned, so [0..255]), or a special label below.
    uint32_t label : 9;
    NodeID node_id : 23;
  };

  // Node index that failure edge leads to. The failure node corresponds to
  // the node which represents the longest proper suffix (include empty
  // string) of the string represented by this node. Not stored if it is
  // equal to kRootID (since that is the most common value).
  //
  // NOTE: Assigning |root| as the failure edge for itself doesn't strictly
  // abide by the definition of "proper" suffix. The proper suffix of an empty
  // string should probably be defined as null, but we assign it to the |root|
  // to simplify the code and have the invariant that the failure edge is always
  // defined.
  static constexpr uint32_t kFailureNodeLabel = 0x100;
  static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel;

  // Node index that corresponds to the longest proper suffix (including empty
  // suffix) of this node and which also represents the end of a pattern.
  // Does not have to exist.
  static constexpr uint32_t kOutputLinkLabel = 0x101;

  // If present, this node represents the end of a pattern. It stores the ID of
  // the corresponding pattern (ie., it is not really a NodeID, but a
  // MatcherStringPattern::ID).
  static constexpr uint32_t kMatchIDLabel = 0x102;

  // Used for uninitialized label slots; used so that we do not have to test for
  // them in other ways, since we know the data will be initialized and never
  // match any other labels.
  static constexpr uint32_t kEmptyLabel = 0x103;

  // A node in the trie, packed tightly together so that it occupies 12 bytes
  // (both on 32- and 64-bit platforms), but aligned to at least 4 (see the
  // comment on edges_).
  class alignas(AhoCorasickEdge) AhoCorasickNode {
   public:
    AhoCorasickNode();
    ~AhoCorasickNode();
    AhoCorasickNode(AhoCorasickNode&& other);
    AhoCorasickNode& operator=(AhoCorasickNode&& other);

    NodeID GetEdge(uint32_t label) const {
      if (edges_capacity_ != 0) {
        return GetEdgeNoInline(label);
      }
      static_assert(kNumInlineEdges == 2, "Code below needs updating");
      if (edges_.inline_edges[0].label == label) {
        return edges_.inline_edges[0].node_id;
      }
      if (edges_.inline_edges[1].label == label) {
        return edges_.inline_edges[1].node_id;
      }
      return kInvalidNodeID;
    }
    NodeID GetEdgeNoInline(uint32_t label) const;
    void SetEdge(uint32_t label, NodeID node);
    const AhoCorasickEdge* edges() const {
      // NOTE: Returning edges_.inline_edges here is fine, because it's
      // the first thing in the struct (see the comment on edges_).
      DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) %
                        alignof(AhoCorasickEdge));
      return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges;
    }

    NodeID failure() const {
      // NOTE: Even if num_edges_ == 0, we are not doing anything
      // undefined, as we will have room for at least two edges
      // and empty edges are set to kEmptyLabel.
      const AhoCorasickEdge& first_edge = *edges();
      if (first_edge.label == kFailureNodeLabel) {
        return first_edge.node_id;
      } else {
        return kRootID;
      }
    }
    void SetFailure(NodeID failure);

    void SetMatchID(MatcherStringPattern::ID id) {
      DCHECK(!IsEndOfPattern());
      DCHECK(id < kInvalidNodeID);  // This is enforced by Build().
      SetEdge(kMatchIDLabel, static_cast<NodeID>(id));
      has_outputs_ = true;
    }

    // Returns true if this node corresponds to a pattern.
    bool IsEndOfPattern() const {
      if (!has_outputs_) {
        // Fast reject.
        return false;
      }
      return GetEdge(kMatchIDLabel) != kInvalidNodeID;
    }

    // Must only be called if |IsEndOfPattern| returns true for this node.
    MatcherStringPattern::ID GetMatchID() const {
      DCHECK(IsEndOfPattern());
      return GetEdge(kMatchIDLabel);
    }

    void SetOutputLink(NodeID node) {
      if (node != kInvalidNodeID) {
        SetEdge(kOutputLinkLabel, node);
        has_outputs_ = true;
      }
    }
    NodeID output_link() const { return GetEdge(kOutputLinkLabel); }

    size_t EstimateMemoryUsage() const;
    size_t num_edges() const {
      if (edges_capacity_ == 0) {
        return kNumInlineEdges - num_free_edges_;
      } else {
        return edges_capacity_ - num_free_edges_;
      }
    }

    bool has_outputs() const { return has_outputs_; }

   private:
    // Outgoing edges of current node, including failure edge and output links.
    // Most nodes have only one or two (or even zero) edges, not the last
    // because many of them are leaves. Thus, we make an optimization for this
    // common case; instead of a pointer to an edge array on the heap, we can
    // pack two edges inline where the pointer would otherwise be. This reduces
    // memory usage dramatically, as well as saving us a cache-line fetch.
    //
    // Note that even though most nodes have fewer outgoing edges, most nodes
    // that we actually traverse will have any of them. This apparent
    // contradiction is because we tend to spend more of our time near the root
    // of the trie, where it is wide. This means that another layout would be
    // possible: If we wanted to, non-inline nodes could simply store an array
    // of 259 (256 possible characters plus the three special label types)
    // edges, indexed directly by label type. This would use 20–50% more RAM,
    // but also increases the speed of lookups due to removing the search loop.
    //
    // The nodes are generally unordered; since we typically index text, even
    // the root will rarely be more than 20–30 wide, and at that point, it's
    // better to just do a linear search than a binary one (which fares poorly
    // on branch predictors). However, a special case, we put kFailureNodeLabel
    // in the first slot if it exists (ie., is not equal to kRootID), since we
    // need to access that label during every single node we look at during
    // traversal.
    //
    // NOTE: Keep this the first member in the struct, so that inline_edges gets
    // 4-aligned (since the class is marked as such, despite being packed.
    // Otherwise, edges() can return an unaligned pointer marked as aligned
    // (the unalignedness gets lost).
    static constexpr int kNumInlineEdges = 2;
    union {
      // Out-of-line edge storage, having room for edges_capacity_ elements.
      // Note that due to __attribute__((packed)) below, this pointer may be
      // unaligned on 64-bit platforms, causing slightly less efficient
      // access to it in some cases.
      // This field is not a raw_ptr<> because it was filtered by the rewriter
      // for: #union
      RAW_PTR_EXCLUSION AhoCorasickEdge* edges;

      // Inline edge storage, used if edges_capacity_ == 0.
      AhoCorasickEdge inline_edges[kNumInlineEdges];
    } edges_;

    // Whether we have an edge for kMatchIDLabel or kOutputLinkLabel,
    // ie., hitting this node during traversal will create one or more
    // matches. This is redundant, but since every single lookup during
    // traversal needs this, it saves a few searches for us.
    bool has_outputs_ = false;

    // Number of unused left in edges_. Edges are always allocated from the
    // beginning and never deleted; those after num_edges_ will be marked with
    // kEmptyLabel (and have an undefined node_id). We store the number of
    // free edges instead of the more common number of _used_ edges, to be
    // sure that we are able to fit it in an uint8_t. num_edges() provides
    // a useful abstraction over this.
    uint8_t num_free_edges_ = kNumInlineEdges;

    // How many edges we have allocated room for (can never be more than
    // kEmptyLabel + 1). If equal to zero, we are not using heap storage,
    // but instead are using inline_edges.
    //
    // If not equal to zero, will be a multiple of 4, so that we can use
    // SIMD to accelerate looking for edges.
    uint16_t edges_capacity_ = 0;
  } __attribute__((packed));

  using SubstringPatternVector = std::vector<const MatcherStringPattern*>;

  // Given the set of patterns, compute how many nodes will the corresponding
  // Aho-Corasick tree have. Note that |patterns| need to be sorted.
  NodeID GetTreeSize(
      const std::vector<const MatcherStringPattern*>& patterns) const;

  void BuildAhoCorasickTree(const SubstringPatternVector& patterns);

  // Inserts a path for |pattern->pattern()| into the tree and adds
  // |pattern->id()| to the set of matches.
  void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern);

  void CreateFailureAndOutputEdges();

  // Adds all pattern IDs to |matches| which are a suffix of the string
  // represented by |node|.
  void AccumulateMatchesForNode(
      const AhoCorasickNode* node,
      std::set<MatcherStringPattern::ID>* matches) const;

  // The nodes of a Aho-Corasick tree.
  std::vector<AhoCorasickNode> tree_;

  bool is_empty_ = true;
};

}  // namespace base

#endif  // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_