340 lines
14 KiB
C++
340 lines
14 KiB
C++
// Copyright 2013 The Chromium Authors
|
||
// Use of this source code is governed by a BSD-style license that can be
|
||
// found in the LICENSE file.
|
||
|
||
#ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
|
||
#define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
|
||
|
||
#include <stdint.h>
|
||
|
||
#include <limits>
|
||
#include <set>
|
||
#include <string>
|
||
#include <vector>
|
||
|
||
#include "base/base_export.h"
|
||
#include "base/check_op.h"
|
||
#include "base/memory/raw_ptr_exclusion.h"
|
||
#include "base/substring_set_matcher/matcher_string_pattern.h"
|
||
|
||
namespace base {
|
||
|
||
// Class that store a set of string patterns and can find for a string S,
|
||
// which string patterns occur in S.
|
||
class BASE_EXPORT SubstringSetMatcher {
|
||
public:
|
||
SubstringSetMatcher();
|
||
SubstringSetMatcher(const SubstringSetMatcher&) = delete;
|
||
SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;
|
||
|
||
~SubstringSetMatcher();
|
||
|
||
// Registers all |patterns|. Each pattern needs to have a unique ID and all
|
||
// pattern strings must be unique. Build() should be called exactly once
|
||
// (before it is called, the tree is empty).
|
||
//
|
||
// Complexity:
|
||
// Let n = number of patterns.
|
||
// Let S = sum of pattern lengths.
|
||
// Let k = range of char. Generally 256.
|
||
// Complexity = O(nlogn + S * logk)
|
||
// nlogn comes from sorting the patterns.
|
||
// log(k) comes from our usage of std::map to store edges.
|
||
//
|
||
// Returns true on success (may fail if e.g. if the tree gets too many nodes).
|
||
bool Build(const std::vector<MatcherStringPattern>& patterns);
|
||
bool Build(std::vector<const MatcherStringPattern*> patterns);
|
||
|
||
// Matches |text| against all registered MatcherStringPatterns. Stores the IDs
|
||
// of matching patterns in |matches|. |matches| is not cleared before adding
|
||
// to it.
|
||
// Complexity:
|
||
// Let t = length of |text|.
|
||
// Let k = range of char. Generally 256.
|
||
// Let z = number of matches returned.
|
||
// Complexity = O(t * logk + zlogz)
|
||
bool Match(const std::string& text,
|
||
std::set<MatcherStringPattern::ID>* matches) const;
|
||
|
||
// As Match(), except it returns immediately on the first match.
|
||
// This allows true/false matching to be done without any dynamic
|
||
// memory allocation.
|
||
// Complexity = O(t * logk)
|
||
bool AnyMatch(const std::string& text) const;
|
||
|
||
// Returns true if this object retains no allocated data.
|
||
bool IsEmpty() const { return is_empty_; }
|
||
|
||
// Returns the dynamically allocated memory usage in bytes. See
|
||
// base/trace_event/memory_usage_estimator.h for details.
|
||
size_t EstimateMemoryUsage() const;
|
||
|
||
private:
|
||
// Represents the index of the node within |tree_|. It is specifically
|
||
// uint32_t so that we can be sure it takes up 4 bytes when stored together
|
||
// with the 9-bit label (so 23 bits are allocated to the NodeID, even though
|
||
// it is exposed as uint32_t). If the computed size of |tree_| is
|
||
// larger than what can be stored within 23 bits, Build() will fail.
|
||
using NodeID = uint32_t;
|
||
|
||
// This is the maximum possible size of |tree_| and hence can't be a valid ID.
|
||
static constexpr NodeID kInvalidNodeID = (1u << 23) - 1;
|
||
|
||
static constexpr NodeID kRootID = 0;
|
||
|
||
// A node of an Aho Corasick Tree. See
|
||
// http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
|
||
// to understand the algorithm.
|
||
//
|
||
// The algorithm is based on the idea of building a trie of all registered
|
||
// patterns. Each node of the tree is annotated with a set of pattern
|
||
// IDs that are used to report matches.
|
||
//
|
||
// The root of the trie represents an empty match. If we were looking whether
|
||
// any registered pattern matches a text at the beginning of the text (i.e.
|
||
// whether any pattern is a prefix of the text), we could just follow
|
||
// nodes in the trie according to the matching characters in the text.
|
||
// E.g., if text == "foobar", we would follow the trie from the root node
|
||
// to its child labeled 'f', from there to child 'o', etc. In this process we
|
||
// would report all pattern IDs associated with the trie nodes as matches.
|
||
//
|
||
// As we are not looking for all prefix matches but all substring matches,
|
||
// this algorithm would need to compare text.substr(0), text.substr(1), ...
|
||
// against the trie, which is in O(|text|^2).
|
||
//
|
||
// The Aho Corasick algorithm improves this runtime by using failure edges.
|
||
// In case we have found a partial match of length k in the text
|
||
// (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
|
||
// a node at depth k, but cannot find a match in the trie for character
|
||
// text[i + k] at depth k + 1, we follow a failure edge. This edge
|
||
// corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
|
||
// is a prefix of any registered pattern.
|
||
//
|
||
// If your brain thinks "Forget it, let's go shopping.", don't worry.
|
||
// Take a nap and read an introductory text on the Aho Corasick algorithm.
|
||
// It will make sense. Eventually.
|
||
|
||
// An edge internal to the tree. We pack the label (character we are
|
||
// matching on) and the destination node ID into 32 bits, to save memory.
|
||
// We also use these edges as a sort of generic key/value store for
|
||
// some special values that not all nodes will have; this also saves on
|
||
// memory over the otherwise obvious choice of having them as struct fields,
|
||
// as it means we do not to store them when they are not present.
|
||
struct AhoCorasickEdge {
|
||
// char (unsigned, so [0..255]), or a special label below.
|
||
uint32_t label : 9;
|
||
NodeID node_id : 23;
|
||
};
|
||
|
||
// Node index that failure edge leads to. The failure node corresponds to
|
||
// the node which represents the longest proper suffix (include empty
|
||
// string) of the string represented by this node. Not stored if it is
|
||
// equal to kRootID (since that is the most common value).
|
||
//
|
||
// NOTE: Assigning |root| as the failure edge for itself doesn't strictly
|
||
// abide by the definition of "proper" suffix. The proper suffix of an empty
|
||
// string should probably be defined as null, but we assign it to the |root|
|
||
// to simplify the code and have the invariant that the failure edge is always
|
||
// defined.
|
||
static constexpr uint32_t kFailureNodeLabel = 0x100;
|
||
static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel;
|
||
|
||
// Node index that corresponds to the longest proper suffix (including empty
|
||
// suffix) of this node and which also represents the end of a pattern.
|
||
// Does not have to exist.
|
||
static constexpr uint32_t kOutputLinkLabel = 0x101;
|
||
|
||
// If present, this node represents the end of a pattern. It stores the ID of
|
||
// the corresponding pattern (ie., it is not really a NodeID, but a
|
||
// MatcherStringPattern::ID).
|
||
static constexpr uint32_t kMatchIDLabel = 0x102;
|
||
|
||
// Used for uninitialized label slots; used so that we do not have to test for
|
||
// them in other ways, since we know the data will be initialized and never
|
||
// match any other labels.
|
||
static constexpr uint32_t kEmptyLabel = 0x103;
|
||
|
||
// A node in the trie, packed tightly together so that it occupies 12 bytes
|
||
// (both on 32- and 64-bit platforms), but aligned to at least 4 (see the
|
||
// comment on edges_).
|
||
class alignas(AhoCorasickEdge) AhoCorasickNode {
|
||
public:
|
||
AhoCorasickNode();
|
||
~AhoCorasickNode();
|
||
AhoCorasickNode(AhoCorasickNode&& other);
|
||
AhoCorasickNode& operator=(AhoCorasickNode&& other);
|
||
|
||
NodeID GetEdge(uint32_t label) const {
|
||
if (edges_capacity_ != 0) {
|
||
return GetEdgeNoInline(label);
|
||
}
|
||
static_assert(kNumInlineEdges == 2, "Code below needs updating");
|
||
if (edges_.inline_edges[0].label == label) {
|
||
return edges_.inline_edges[0].node_id;
|
||
}
|
||
if (edges_.inline_edges[1].label == label) {
|
||
return edges_.inline_edges[1].node_id;
|
||
}
|
||
return kInvalidNodeID;
|
||
}
|
||
NodeID GetEdgeNoInline(uint32_t label) const;
|
||
void SetEdge(uint32_t label, NodeID node);
|
||
const AhoCorasickEdge* edges() const {
|
||
// NOTE: Returning edges_.inline_edges here is fine, because it's
|
||
// the first thing in the struct (see the comment on edges_).
|
||
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) %
|
||
alignof(AhoCorasickEdge));
|
||
return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges;
|
||
}
|
||
|
||
NodeID failure() const {
|
||
// NOTE: Even if num_edges_ == 0, we are not doing anything
|
||
// undefined, as we will have room for at least two edges
|
||
// and empty edges are set to kEmptyLabel.
|
||
const AhoCorasickEdge& first_edge = *edges();
|
||
if (first_edge.label == kFailureNodeLabel) {
|
||
return first_edge.node_id;
|
||
} else {
|
||
return kRootID;
|
||
}
|
||
}
|
||
void SetFailure(NodeID failure);
|
||
|
||
void SetMatchID(MatcherStringPattern::ID id) {
|
||
DCHECK(!IsEndOfPattern());
|
||
DCHECK(id < kInvalidNodeID); // This is enforced by Build().
|
||
SetEdge(kMatchIDLabel, static_cast<NodeID>(id));
|
||
has_outputs_ = true;
|
||
}
|
||
|
||
// Returns true if this node corresponds to a pattern.
|
||
bool IsEndOfPattern() const {
|
||
if (!has_outputs_) {
|
||
// Fast reject.
|
||
return false;
|
||
}
|
||
return GetEdge(kMatchIDLabel) != kInvalidNodeID;
|
||
}
|
||
|
||
// Must only be called if |IsEndOfPattern| returns true for this node.
|
||
MatcherStringPattern::ID GetMatchID() const {
|
||
DCHECK(IsEndOfPattern());
|
||
return GetEdge(kMatchIDLabel);
|
||
}
|
||
|
||
void SetOutputLink(NodeID node) {
|
||
if (node != kInvalidNodeID) {
|
||
SetEdge(kOutputLinkLabel, node);
|
||
has_outputs_ = true;
|
||
}
|
||
}
|
||
NodeID output_link() const { return GetEdge(kOutputLinkLabel); }
|
||
|
||
size_t EstimateMemoryUsage() const;
|
||
size_t num_edges() const {
|
||
if (edges_capacity_ == 0) {
|
||
return kNumInlineEdges - num_free_edges_;
|
||
} else {
|
||
return edges_capacity_ - num_free_edges_;
|
||
}
|
||
}
|
||
|
||
bool has_outputs() const { return has_outputs_; }
|
||
|
||
private:
|
||
// Outgoing edges of current node, including failure edge and output links.
|
||
// Most nodes have only one or two (or even zero) edges, not the last
|
||
// because many of them are leaves. Thus, we make an optimization for this
|
||
// common case; instead of a pointer to an edge array on the heap, we can
|
||
// pack two edges inline where the pointer would otherwise be. This reduces
|
||
// memory usage dramatically, as well as saving us a cache-line fetch.
|
||
//
|
||
// Note that even though most nodes have fewer outgoing edges, most nodes
|
||
// that we actually traverse will have any of them. This apparent
|
||
// contradiction is because we tend to spend more of our time near the root
|
||
// of the trie, where it is wide. This means that another layout would be
|
||
// possible: If we wanted to, non-inline nodes could simply store an array
|
||
// of 259 (256 possible characters plus the three special label types)
|
||
// edges, indexed directly by label type. This would use 20–50% more RAM,
|
||
// but also increases the speed of lookups due to removing the search loop.
|
||
//
|
||
// The nodes are generally unordered; since we typically index text, even
|
||
// the root will rarely be more than 20–30 wide, and at that point, it's
|
||
// better to just do a linear search than a binary one (which fares poorly
|
||
// on branch predictors). However, a special case, we put kFailureNodeLabel
|
||
// in the first slot if it exists (ie., is not equal to kRootID), since we
|
||
// need to access that label during every single node we look at during
|
||
// traversal.
|
||
//
|
||
// NOTE: Keep this the first member in the struct, so that inline_edges gets
|
||
// 4-aligned (since the class is marked as such, despite being packed.
|
||
// Otherwise, edges() can return an unaligned pointer marked as aligned
|
||
// (the unalignedness gets lost).
|
||
static constexpr int kNumInlineEdges = 2;
|
||
union {
|
||
// Out-of-line edge storage, having room for edges_capacity_ elements.
|
||
// Note that due to __attribute__((packed)) below, this pointer may be
|
||
// unaligned on 64-bit platforms, causing slightly less efficient
|
||
// access to it in some cases.
|
||
// This field is not a raw_ptr<> because it was filtered by the rewriter
|
||
// for: #union
|
||
RAW_PTR_EXCLUSION AhoCorasickEdge* edges;
|
||
|
||
// Inline edge storage, used if edges_capacity_ == 0.
|
||
AhoCorasickEdge inline_edges[kNumInlineEdges];
|
||
} edges_;
|
||
|
||
// Whether we have an edge for kMatchIDLabel or kOutputLinkLabel,
|
||
// ie., hitting this node during traversal will create one or more
|
||
// matches. This is redundant, but since every single lookup during
|
||
// traversal needs this, it saves a few searches for us.
|
||
bool has_outputs_ = false;
|
||
|
||
// Number of unused left in edges_. Edges are always allocated from the
|
||
// beginning and never deleted; those after num_edges_ will be marked with
|
||
// kEmptyLabel (and have an undefined node_id). We store the number of
|
||
// free edges instead of the more common number of _used_ edges, to be
|
||
// sure that we are able to fit it in an uint8_t. num_edges() provides
|
||
// a useful abstraction over this.
|
||
uint8_t num_free_edges_ = kNumInlineEdges;
|
||
|
||
// How many edges we have allocated room for (can never be more than
|
||
// kEmptyLabel + 1). If equal to zero, we are not using heap storage,
|
||
// but instead are using inline_edges.
|
||
//
|
||
// If not equal to zero, will be a multiple of 4, so that we can use
|
||
// SIMD to accelerate looking for edges.
|
||
uint16_t edges_capacity_ = 0;
|
||
} __attribute__((packed));
|
||
|
||
using SubstringPatternVector = std::vector<const MatcherStringPattern*>;
|
||
|
||
// Given the set of patterns, compute how many nodes will the corresponding
|
||
// Aho-Corasick tree have. Note that |patterns| need to be sorted.
|
||
NodeID GetTreeSize(
|
||
const std::vector<const MatcherStringPattern*>& patterns) const;
|
||
|
||
void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
|
||
|
||
// Inserts a path for |pattern->pattern()| into the tree and adds
|
||
// |pattern->id()| to the set of matches.
|
||
void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern);
|
||
|
||
void CreateFailureAndOutputEdges();
|
||
|
||
// Adds all pattern IDs to |matches| which are a suffix of the string
|
||
// represented by |node|.
|
||
void AccumulateMatchesForNode(
|
||
const AhoCorasickNode* node,
|
||
std::set<MatcherStringPattern::ID>* matches) const;
|
||
|
||
// The nodes of a Aho-Corasick tree.
|
||
std::vector<AhoCorasickNode> tree_;
|
||
|
||
bool is_empty_ = true;
|
||
};
|
||
|
||
} // namespace base
|
||
|
||
#endif // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
|