blob: f5a9cb57722e042564a69af4fbf0d5540178a596 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
import re
import sqlparse
from sqlparse.tokens import Name
from collections import defaultdict
from .pgliterals.main import get_literals
# Matches any run of whitespace; used to find the gaps inside multi-word keywords.
white_space_regex = re.compile("\\s+", re.MULTILINE)


def _compile_regex(keyword):
    """Return a compiled, case-insensitive whole-word matcher for *keyword*.

    The keyword is treated as literal text: every whitespace-separated piece
    is passed through ``re.escape`` so that characters such as ``.`` or ``+``
    cannot be misread as regex syntax. Each run of whitespace in the keyword
    becomes ``\\s+``, so a multi-word keyword matches however its words are
    spaced or wrapped in the searched text.

    The pattern is wrapped in ``\\b`` word boundaries, so a keyword is not
    matched inside a longer identifier.
    """
    # Split on whitespace first, then escape: escaping the whole keyword would
    # also escape the whitespace on some Python versions and hide it from the
    # wildcard substitution.
    pieces = white_space_regex.split(keyword)
    pattern = "\\b" + "\\s+".join(re.escape(piece) for piece in pieces) + "\\b"
    return re.compile(pattern, re.MULTILINE | re.IGNORECASE)
# Keyword literals obtained from the pgliterals helper.
# NOTE(review): presumably an iterable of keyword strings (iterating a dict
# would yield its keys) -- confirm against get_literals.
keywords = get_literals("keywords")
# One compiled matcher per keyword, keyed by the keyword itself, so the
# patterns are built once here rather than on every update_keywords() call.
keyword_regexs = {kw: _compile_regex(kw) for kw in keywords}
class PrevalenceCounter:
    """Tally how often keywords and names occur in the text fed to it.

    Counts accumulate across calls to ``update``. Keyword counts are keyed by
    the entries of the module-level ``keyword_regexs`` table; name counts are
    keyed by the token text reported by sqlparse.
    """

    def __init__(self):
        # defaultdict(int) lets the update methods increment unseen keys
        # without an explicit existence check.
        self.keyword_counts = defaultdict(int)
        self.name_counts = defaultdict(int)

    def update(self, text):
        """Fold *text* into both the keyword and the name tallies."""
        self.update_keywords(text)
        self.update_names(text)

    def update_names(self, text):
        """Add one to the name tally for each Name token found in *text*.

        *text* is parsed with ``sqlparse.parse``; every flattened token whose
        ``ttype`` is ``in`` sqlparse's ``Name`` token type is counted under
        its ``value``.
        """
        for parsed in sqlparse.parse(text):
            for token in parsed.flatten():
                if token.ttype in Name:
                    self.name_counts[token.value] += 1

    def clear_names(self):
        """Discard all name counts; keyword counts are left untouched."""
        self.name_counts = defaultdict(int)

    def update_keywords(self, text):
        """Add one to the keyword tally for each keyword match in *text*."""
        # Count keywords. Can't rely on sqlparse for this, because it's
        # database agnostic
        for keyword, regex in keyword_regexs.items():
            for _ in regex.finditer(text):
                self.keyword_counts[keyword] += 1

    def keyword_count(self, keyword):
        """Return how many times *keyword* has been counted (0 if never)."""
        # .get() instead of indexing: indexing a defaultdict inserts the
        # missing key, so a read-only query would grow the table.
        return self.keyword_counts.get(keyword, 0)

    def name_count(self, name):
        """Return how many times *name* has been counted (0 if never)."""
        # Same reasoning as keyword_count: look up without inserting.
        return self.name_counts.get(name, 0)
|