340 lines
14 KiB
C
340 lines
14 KiB
C
|
|
// Copyright 2013 The Chromium Authors
|
|||
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|||
|
|
// found in the LICENSE file.
|
|||
|
|
|
|||
|
|
#ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
|
|||
|
|
#define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
|
|||
|
|
|
|||
|
|
#include <stdint.h>
|
|||
|
|
|
|||
|
|
#include <limits>
|
|||
|
|
#include <set>
|
|||
|
|
#include <string>
|
|||
|
|
#include <vector>
|
|||
|
|
|
|||
|
|
#include "base/base_export.h"
|
|||
|
|
#include "base/check_op.h"
|
|||
|
|
#include "base/memory/raw_ptr_exclusion.h"
|
|||
|
|
#include "base/substring_set_matcher/matcher_string_pattern.h"
|
|||
|
|
|
|||
|
|
namespace base {
|
|||
|
|
|
|||
|
|
// Class that store a set of string patterns and can find for a string S,
|
|||
|
|
// which string patterns occur in S.
|
|||
|
|
class BASE_EXPORT SubstringSetMatcher {
|
|||
|
|
public:
|
|||
|
|
SubstringSetMatcher();
|
|||
|
|
SubstringSetMatcher(const SubstringSetMatcher&) = delete;
|
|||
|
|
SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;
|
|||
|
|
|
|||
|
|
~SubstringSetMatcher();
|
|||
|
|
|
|||
|
|
// Registers all |patterns|. Each pattern needs to have a unique ID and all
|
|||
|
|
// pattern strings must be unique. Build() should be called exactly once
|
|||
|
|
// (before it is called, the tree is empty).
|
|||
|
|
//
|
|||
|
|
// Complexity:
|
|||
|
|
// Let n = number of patterns.
|
|||
|
|
// Let S = sum of pattern lengths.
|
|||
|
|
// Let k = range of char. Generally 256.
|
|||
|
|
// Complexity = O(nlogn + S * logk)
|
|||
|
|
// nlogn comes from sorting the patterns.
|
|||
|
|
// log(k) comes from our usage of std::map to store edges.
|
|||
|
|
//
|
|||
|
|
// Returns true on success (may fail if e.g. if the tree gets too many nodes).
|
|||
|
|
bool Build(const std::vector<MatcherStringPattern>& patterns);
|
|||
|
|
bool Build(std::vector<const MatcherStringPattern*> patterns);
|
|||
|
|
|
|||
|
|
// Matches |text| against all registered MatcherStringPatterns. Stores the IDs
|
|||
|
|
// of matching patterns in |matches|. |matches| is not cleared before adding
|
|||
|
|
// to it.
|
|||
|
|
// Complexity:
|
|||
|
|
// Let t = length of |text|.
|
|||
|
|
// Let k = range of char. Generally 256.
|
|||
|
|
// Let z = number of matches returned.
|
|||
|
|
// Complexity = O(t * logk + zlogz)
|
|||
|
|
bool Match(const std::string& text,
|
|||
|
|
std::set<MatcherStringPattern::ID>* matches) const;
|
|||
|
|
|
|||
|
|
// As Match(), except it returns immediately on the first match.
|
|||
|
|
// This allows true/false matching to be done without any dynamic
|
|||
|
|
// memory allocation.
|
|||
|
|
// Complexity = O(t * logk)
|
|||
|
|
bool AnyMatch(const std::string& text) const;
|
|||
|
|
|
|||
|
|
// Returns true if this object retains no allocated data.
|
|||
|
|
bool IsEmpty() const { return is_empty_; }
|
|||
|
|
|
|||
|
|
// Returns the dynamically allocated memory usage in bytes. See
|
|||
|
|
// base/trace_event/memory_usage_estimator.h for details.
|
|||
|
|
size_t EstimateMemoryUsage() const;
|
|||
|
|
|
|||
|
|
private:
|
|||
|
|
// Represents the index of the node within |tree_|. It is specifically
|
|||
|
|
// uint32_t so that we can be sure it takes up 4 bytes when stored together
|
|||
|
|
// with the 9-bit label (so 23 bits are allocated to the NodeID, even though
|
|||
|
|
// it is exposed as uint32_t). If the computed size of |tree_| is
|
|||
|
|
// larger than what can be stored within 23 bits, Build() will fail.
|
|||
|
|
using NodeID = uint32_t;
|
|||
|
|
|
|||
|
|
// This is the maximum possible size of |tree_| and hence can't be a valid ID.
|
|||
|
|
static constexpr NodeID kInvalidNodeID = (1u << 23) - 1;
|
|||
|
|
|
|||
|
|
static constexpr NodeID kRootID = 0;
|
|||
|
|
|
|||
|
|
// A node of an Aho Corasick Tree. See
|
|||
|
|
// http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
|
|||
|
|
// to understand the algorithm.
|
|||
|
|
//
|
|||
|
|
// The algorithm is based on the idea of building a trie of all registered
|
|||
|
|
// patterns. Each node of the tree is annotated with a set of pattern
|
|||
|
|
// IDs that are used to report matches.
|
|||
|
|
//
|
|||
|
|
// The root of the trie represents an empty match. If we were looking whether
|
|||
|
|
// any registered pattern matches a text at the beginning of the text (i.e.
|
|||
|
|
// whether any pattern is a prefix of the text), we could just follow
|
|||
|
|
// nodes in the trie according to the matching characters in the text.
|
|||
|
|
// E.g., if text == "foobar", we would follow the trie from the root node
|
|||
|
|
// to its child labeled 'f', from there to child 'o', etc. In this process we
|
|||
|
|
// would report all pattern IDs associated with the trie nodes as matches.
|
|||
|
|
//
|
|||
|
|
// As we are not looking for all prefix matches but all substring matches,
|
|||
|
|
// this algorithm would need to compare text.substr(0), text.substr(1), ...
|
|||
|
|
// against the trie, which is in O(|text|^2).
|
|||
|
|
//
|
|||
|
|
// The Aho Corasick algorithm improves this runtime by using failure edges.
|
|||
|
|
// In case we have found a partial match of length k in the text
|
|||
|
|
// (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
|
|||
|
|
// a node at depth k, but cannot find a match in the trie for character
|
|||
|
|
// text[i + k] at depth k + 1, we follow a failure edge. This edge
|
|||
|
|
// corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
|
|||
|
|
// is a prefix of any registered pattern.
|
|||
|
|
//
|
|||
|
|
// If your brain thinks "Forget it, let's go shopping.", don't worry.
|
|||
|
|
// Take a nap and read an introductory text on the Aho Corasick algorithm.
|
|||
|
|
// It will make sense. Eventually.
|
|||
|
|
|
|||
|
|
// An edge internal to the tree. We pack the label (character we are
|
|||
|
|
// matching on) and the destination node ID into 32 bits, to save memory.
|
|||
|
|
// We also use these edges as a sort of generic key/value store for
|
|||
|
|
// some special values that not all nodes will have; this also saves on
|
|||
|
|
// memory over the otherwise obvious choice of having them as struct fields,
|
|||
|
|
// as it means we do not to store them when they are not present.
|
|||
|
|
struct AhoCorasickEdge {
|
|||
|
|
// char (unsigned, so [0..255]), or a special label below.
|
|||
|
|
uint32_t label : 9;
|
|||
|
|
NodeID node_id : 23;
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
// Node index that failure edge leads to. The failure node corresponds to
|
|||
|
|
// the node which represents the longest proper suffix (include empty
|
|||
|
|
// string) of the string represented by this node. Not stored if it is
|
|||
|
|
// equal to kRootID (since that is the most common value).
|
|||
|
|
//
|
|||
|
|
// NOTE: Assigning |root| as the failure edge for itself doesn't strictly
|
|||
|
|
// abide by the definition of "proper" suffix. The proper suffix of an empty
|
|||
|
|
// string should probably be defined as null, but we assign it to the |root|
|
|||
|
|
// to simplify the code and have the invariant that the failure edge is always
|
|||
|
|
// defined.
|
|||
|
|
static constexpr uint32_t kFailureNodeLabel = 0x100;
|
|||
|
|
static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel;
|
|||
|
|
|
|||
|
|
// Node index that corresponds to the longest proper suffix (including empty
|
|||
|
|
// suffix) of this node and which also represents the end of a pattern.
|
|||
|
|
// Does not have to exist.
|
|||
|
|
static constexpr uint32_t kOutputLinkLabel = 0x101;
|
|||
|
|
|
|||
|
|
// If present, this node represents the end of a pattern. It stores the ID of
|
|||
|
|
// the corresponding pattern (ie., it is not really a NodeID, but a
|
|||
|
|
// MatcherStringPattern::ID).
|
|||
|
|
static constexpr uint32_t kMatchIDLabel = 0x102;
|
|||
|
|
|
|||
|
|
// Used for uninitialized label slots; used so that we do not have to test for
|
|||
|
|
// them in other ways, since we know the data will be initialized and never
|
|||
|
|
// match any other labels.
|
|||
|
|
static constexpr uint32_t kEmptyLabel = 0x103;
|
|||
|
|
|
|||
|
|
// A node in the trie, packed tightly together so that it occupies 12 bytes
|
|||
|
|
// (both on 32- and 64-bit platforms), but aligned to at least 4 (see the
|
|||
|
|
// comment on edges_).
|
|||
|
|
class alignas(AhoCorasickEdge) AhoCorasickNode {
|
|||
|
|
public:
|
|||
|
|
AhoCorasickNode();
|
|||
|
|
~AhoCorasickNode();
|
|||
|
|
AhoCorasickNode(AhoCorasickNode&& other);
|
|||
|
|
AhoCorasickNode& operator=(AhoCorasickNode&& other);
|
|||
|
|
|
|||
|
|
NodeID GetEdge(uint32_t label) const {
|
|||
|
|
if (edges_capacity_ != 0) {
|
|||
|
|
return GetEdgeNoInline(label);
|
|||
|
|
}
|
|||
|
|
static_assert(kNumInlineEdges == 2, "Code below needs updating");
|
|||
|
|
if (edges_.inline_edges[0].label == label) {
|
|||
|
|
return edges_.inline_edges[0].node_id;
|
|||
|
|
}
|
|||
|
|
if (edges_.inline_edges[1].label == label) {
|
|||
|
|
return edges_.inline_edges[1].node_id;
|
|||
|
|
}
|
|||
|
|
return kInvalidNodeID;
|
|||
|
|
}
|
|||
|
|
NodeID GetEdgeNoInline(uint32_t label) const;
|
|||
|
|
void SetEdge(uint32_t label, NodeID node);
|
|||
|
|
const AhoCorasickEdge* edges() const {
|
|||
|
|
// NOTE: Returning edges_.inline_edges here is fine, because it's
|
|||
|
|
// the first thing in the struct (see the comment on edges_).
|
|||
|
|
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) %
|
|||
|
|
alignof(AhoCorasickEdge));
|
|||
|
|
return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
NodeID failure() const {
|
|||
|
|
// NOTE: Even if num_edges_ == 0, we are not doing anything
|
|||
|
|
// undefined, as we will have room for at least two edges
|
|||
|
|
// and empty edges are set to kEmptyLabel.
|
|||
|
|
const AhoCorasickEdge& first_edge = *edges();
|
|||
|
|
if (first_edge.label == kFailureNodeLabel) {
|
|||
|
|
return first_edge.node_id;
|
|||
|
|
} else {
|
|||
|
|
return kRootID;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
void SetFailure(NodeID failure);
|
|||
|
|
|
|||
|
|
void SetMatchID(MatcherStringPattern::ID id) {
|
|||
|
|
DCHECK(!IsEndOfPattern());
|
|||
|
|
DCHECK(id < kInvalidNodeID); // This is enforced by Build().
|
|||
|
|
SetEdge(kMatchIDLabel, static_cast<NodeID>(id));
|
|||
|
|
has_outputs_ = true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Returns true if this node corresponds to a pattern.
|
|||
|
|
bool IsEndOfPattern() const {
|
|||
|
|
if (!has_outputs_) {
|
|||
|
|
// Fast reject.
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
return GetEdge(kMatchIDLabel) != kInvalidNodeID;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Must only be called if |IsEndOfPattern| returns true for this node.
|
|||
|
|
MatcherStringPattern::ID GetMatchID() const {
|
|||
|
|
DCHECK(IsEndOfPattern());
|
|||
|
|
return GetEdge(kMatchIDLabel);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
void SetOutputLink(NodeID node) {
|
|||
|
|
if (node != kInvalidNodeID) {
|
|||
|
|
SetEdge(kOutputLinkLabel, node);
|
|||
|
|
has_outputs_ = true;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
NodeID output_link() const { return GetEdge(kOutputLinkLabel); }
|
|||
|
|
|
|||
|
|
size_t EstimateMemoryUsage() const;
|
|||
|
|
size_t num_edges() const {
|
|||
|
|
if (edges_capacity_ == 0) {
|
|||
|
|
return kNumInlineEdges - num_free_edges_;
|
|||
|
|
} else {
|
|||
|
|
return edges_capacity_ - num_free_edges_;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
bool has_outputs() const { return has_outputs_; }
|
|||
|
|
|
|||
|
|
private:
|
|||
|
|
// Outgoing edges of current node, including failure edge and output links.
|
|||
|
|
// Most nodes have only one or two (or even zero) edges, not the last
|
|||
|
|
// because many of them are leaves. Thus, we make an optimization for this
|
|||
|
|
// common case; instead of a pointer to an edge array on the heap, we can
|
|||
|
|
// pack two edges inline where the pointer would otherwise be. This reduces
|
|||
|
|
// memory usage dramatically, as well as saving us a cache-line fetch.
|
|||
|
|
//
|
|||
|
|
// Note that even though most nodes have fewer outgoing edges, most nodes
|
|||
|
|
// that we actually traverse will have any of them. This apparent
|
|||
|
|
// contradiction is because we tend to spend more of our time near the root
|
|||
|
|
// of the trie, where it is wide. This means that another layout would be
|
|||
|
|
// possible: If we wanted to, non-inline nodes could simply store an array
|
|||
|
|
// of 259 (256 possible characters plus the three special label types)
|
|||
|
|
// edges, indexed directly by label type. This would use 20–50% more RAM,
|
|||
|
|
// but also increases the speed of lookups due to removing the search loop.
|
|||
|
|
//
|
|||
|
|
// The nodes are generally unordered; since we typically index text, even
|
|||
|
|
// the root will rarely be more than 20–30 wide, and at that point, it's
|
|||
|
|
// better to just do a linear search than a binary one (which fares poorly
|
|||
|
|
// on branch predictors). However, a special case, we put kFailureNodeLabel
|
|||
|
|
// in the first slot if it exists (ie., is not equal to kRootID), since we
|
|||
|
|
// need to access that label during every single node we look at during
|
|||
|
|
// traversal.
|
|||
|
|
//
|
|||
|
|
// NOTE: Keep this the first member in the struct, so that inline_edges gets
|
|||
|
|
// 4-aligned (since the class is marked as such, despite being packed.
|
|||
|
|
// Otherwise, edges() can return an unaligned pointer marked as aligned
|
|||
|
|
// (the unalignedness gets lost).
|
|||
|
|
static constexpr int kNumInlineEdges = 2;
|
|||
|
|
union {
|
|||
|
|
// Out-of-line edge storage, having room for edges_capacity_ elements.
|
|||
|
|
// Note that due to __attribute__((packed)) below, this pointer may be
|
|||
|
|
// unaligned on 64-bit platforms, causing slightly less efficient
|
|||
|
|
// access to it in some cases.
|
|||
|
|
// This field is not a raw_ptr<> because it was filtered by the rewriter
|
|||
|
|
// for: #union
|
|||
|
|
RAW_PTR_EXCLUSION AhoCorasickEdge* edges;
|
|||
|
|
|
|||
|
|
// Inline edge storage, used if edges_capacity_ == 0.
|
|||
|
|
AhoCorasickEdge inline_edges[kNumInlineEdges];
|
|||
|
|
} edges_;
|
|||
|
|
|
|||
|
|
// Whether we have an edge for kMatchIDLabel or kOutputLinkLabel,
|
|||
|
|
// ie., hitting this node during traversal will create one or more
|
|||
|
|
// matches. This is redundant, but since every single lookup during
|
|||
|
|
// traversal needs this, it saves a few searches for us.
|
|||
|
|
bool has_outputs_ = false;
|
|||
|
|
|
|||
|
|
// Number of unused left in edges_. Edges are always allocated from the
|
|||
|
|
// beginning and never deleted; those after num_edges_ will be marked with
|
|||
|
|
// kEmptyLabel (and have an undefined node_id). We store the number of
|
|||
|
|
// free edges instead of the more common number of _used_ edges, to be
|
|||
|
|
// sure that we are able to fit it in an uint8_t. num_edges() provides
|
|||
|
|
// a useful abstraction over this.
|
|||
|
|
uint8_t num_free_edges_ = kNumInlineEdges;
|
|||
|
|
|
|||
|
|
// How many edges we have allocated room for (can never be more than
|
|||
|
|
// kEmptyLabel + 1). If equal to zero, we are not using heap storage,
|
|||
|
|
// but instead are using inline_edges.
|
|||
|
|
//
|
|||
|
|
// If not equal to zero, will be a multiple of 4, so that we can use
|
|||
|
|
// SIMD to accelerate looking for edges.
|
|||
|
|
uint16_t edges_capacity_ = 0;
|
|||
|
|
} __attribute__((packed));
|
|||
|
|
|
|||
|
|
using SubstringPatternVector = std::vector<const MatcherStringPattern*>;
|
|||
|
|
|
|||
|
|
// Given the set of patterns, compute how many nodes will the corresponding
|
|||
|
|
// Aho-Corasick tree have. Note that |patterns| need to be sorted.
|
|||
|
|
NodeID GetTreeSize(
|
|||
|
|
const std::vector<const MatcherStringPattern*>& patterns) const;
|
|||
|
|
|
|||
|
|
void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
|
|||
|
|
|
|||
|
|
// Inserts a path for |pattern->pattern()| into the tree and adds
|
|||
|
|
// |pattern->id()| to the set of matches.
|
|||
|
|
void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern);
|
|||
|
|
|
|||
|
|
void CreateFailureAndOutputEdges();
|
|||
|
|
|
|||
|
|
// Adds all pattern IDs to |matches| which are a suffix of the string
|
|||
|
|
// represented by |node|.
|
|||
|
|
void AccumulateMatchesForNode(
|
|||
|
|
const AhoCorasickNode* node,
|
|||
|
|
std::set<MatcherStringPattern::ID>* matches) const;
|
|||
|
|
|
|||
|
|
// The nodes of a Aho-Corasick tree.
|
|||
|
|
std::vector<AhoCorasickNode> tree_;
|
|||
|
|
|
|||
|
|
bool is_empty_ = true;
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
} // namespace base
|
|||
|
|
|
|||
|
|
#endif // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
|