diff options
Diffstat (limited to 'clang-r353983/include/llvm/Support/TrigramIndex.h')
| -rw-r--r-- | clang-r353983/include/llvm/Support/TrigramIndex.h | 69 |
1 files changed, 69 insertions, 0 deletions
diff --git a/clang-r353983/include/llvm/Support/TrigramIndex.h b/clang-r353983/include/llvm/Support/TrigramIndex.h new file mode 100644 index 00000000..9351c2db --- /dev/null +++ b/clang-r353983/include/llvm/Support/TrigramIndex.h @@ -0,0 +1,69 @@ +//===-- TrigramIndex.h - a heuristic for SpecialCaseList --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//===----------------------------------------------------------------------===// +// +// TrigramIndex implements a heuristic for SpecialCaseList that allows to +// filter out ~99% incoming queries when all regular expressions in the +// SpecialCaseList are simple wildcards with '*' and '.'. If rules are more +// complicated, the check is defeated and it will always pass the queries to a +// full regex. +// +// The basic idea is that in order for a wildcard to match a query, the query +// needs to have all trigrams which occur in the wildcard. We create a trigram +// index (trigram -> list of rules with it) and then count trigrams in the query +// for each rule. If the count for one of the rules reaches the expected value, +// the check passes the query to a regex. If none of the rules got enough +// trigrams, the check tells that the query is definitely not matched by any +// of the rules, and no regex matching is needed. +// A similar idea was used in Google Code Search as described in the blog post: +// https://swtch.com/~rsc/regexp/regexp4.html +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_TRIGRAMINDEX_H +#define LLVM_SUPPORT_TRIGRAMINDEX_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" + +#include <string> +#include <unordered_map> +#include <vector> + +namespace llvm { +class StringRef; + +class TrigramIndex { + public: + /// Inserts a new Regex into the index. + void insert(std::string Regex); + + /// Returns true, if special case list definitely does not have a line + /// that matches the query. Returns false, if it's not sure. + bool isDefinitelyOut(StringRef Query) const; + + /// Returned true, iff the heuristic is defeated and not useful. + /// In this case isDefinitelyOut always returns false. + bool isDefeated() { return Defeated; } + private: + // If true, the rules are too complicated for the check to work, and full + // regex matching is needed for every rule. + bool Defeated = false; + // The minimum number of trigrams which should match for a rule to have a + // chance to match the query. The number of elements equals the number of + // regex rules in the SpecialCaseList. + std::vector<unsigned> Counts; + // Index holds a list of rules indices for each trigram. The same indices + // are used in Counts to store per-rule limits. + // If a trigram is too common (>4 rules with it), we stop tracking it, + // which increases the probability for a need to match using regex, but + // decreases the costs in the regular case. + std::unordered_map<unsigned, SmallVector<size_t, 4>> Index{256}; +}; + +} // namespace llvm + +#endif // LLVM_SUPPORT_TRIGRAMINDEX_H |
