-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathsearch.py
More file actions
executable file
·214 lines (183 loc) · 8.05 KB
/
search.py
File metadata and controls
executable file
·214 lines (183 loc) · 8.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
Greynir: Natural language processing for Icelandic
Search module
Copyright (C) 2023 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
This module implements a search mechanism. The Search class parses
a search string into a list of word stems and creates a topic vector from it,
which is then used in a similarity query to find related articles.
"""
from typing import Iterable, Iterator, Optional, List, Tuple, TypedDict
from datetime import datetime, timedelta
from settings import Settings
from db import Session
from db.models import Root, Article
from similar import SimilarityClient
class SimilarDict(TypedDict):
    """Typed dictionary describing a single article in the result
    of a similarity query"""

    # Heading of the article
    heading: str
    # URL of the article
    url: str
    # Article id, taken from the similarity result tuple
    uuid: str
    # Domain of the root that the article belongs to
    domain: str
    # Timestamp of the article
    ts: datetime
    # First 10 characters of the ISO-formatted timestamp (the date part)
    ts_text: str
    # Similarity in percent (Search.list_articles stores 100.0 * similarity)
    similarity: float
class WeightsDict(TypedDict):
    """Typed dictionary for the result of a similarity query by terms,
    as returned by Search.list_similar_to_terms()"""

    # The "weights" list from the similarity client's response
    # (presumably one weight per search term - TODO confirm)
    weights: List[float]
    # Descriptors of the similar articles that were found
    articles: List[SimilarDict]
class Search:
    """This class wraps search queries to the similarity server
    via the similarity client.

    All functionality is exposed through class methods, so the class
    is normally not instantiated."""

    def __init__(self) -> None:
        """This class is normally not instantiated"""

    @classmethod
    def _new_client(cls) -> SimilarityClient:
        """Create a new similarity client for each request to avoid
        connection sharing issues between concurrent greenlets/threads"""
        return SimilarityClient()

    @classmethod
    def list_similar_to_article(
        cls, session: Session, uuid: str, n: int
    ) -> Tuple[List[SimilarDict], bool]:
        """List n articles that are similar to the article with the given id.
        Returns a tuple of (similar_articles, not_indexed) where not_indexed
        is True if the article has not yet been indexed for similarity."""
        client = cls._new_client()
        try:
            # Ask for a few more candidates than needed, since
            # list_articles() may drop some of them
            result = client.list_similar_to_article(uuid, n=n + 5)
            not_indexed: bool = result.get("not_indexed", False)
            articles: List[Tuple[str, float]] = result.get("articles", [])
            return cls.list_articles(session, articles, n), not_indexed
        finally:
            # Always release the per-request client
            client.close()

    @classmethod
    def list_similar_to_topic(
        cls, session: Session, topic_vector: List[float], n: int
    ) -> List[SimilarDict]:
        """List n articles that are similar to the given topic vector"""
        client = cls._new_client()
        try:
            # Ask for a few more candidates than needed, since
            # list_articles() may drop some of them
            result = client.list_similar_to_topic(topic_vector, n=n + 5)
            articles: List[Tuple[str, float]] = result.get("articles", [])
            return cls.list_articles(session, articles, n)
        finally:
            # Always release the per-request client
            client.close()

    @classmethod
    def list_similar_to_terms(
        cls, session: Session, terms: List[Tuple[str, str]], n: int
    ) -> WeightsDict:
        """List n articles that are similar to the given terms. The
        terms are expected to be a list of (stem, category) tuples.
        Returns a dictionary containing the weights reported by the
        similarity client along with the article descriptors."""
        client = cls._new_client()
        try:
            # Ask for a few more candidates than needed, since
            # list_articles() may drop some of them
            result = client.list_similar_to_terms(terms, n=n + 5)
            articles: List[Tuple[str, float]] = result.get("articles", [])
            weights: List[float] = result.get("weights", [])
            return WeightsDict(
                weights=weights, articles=cls.list_articles(session, articles, n)
            )
        finally:
            # Always release the per-request client
            client.close()

    @staticmethod
    def _is_probably_same(candidate: SimilarDict, last: SimilarDict) -> bool:
        """Return True if the candidate article is probably the same as
        the one already described in the last object, i.e. if both come
        from the same domain, have timestamps within 10 minutes of each
        other and have practically the same similarity"""
        if last["domain"] != candidate["domain"]:
            # Another root domain: can't be the same content
            return False
        if abs(last["ts"] - candidate["ts"]) > timedelta(minutes=10):
            # More than 10 minutes timestamp difference
            return False
        previous = last["similarity"]
        current = candidate["similarity"]
        if previous == 0.0:
            # Guard against division by zero: two zero similarities are
            # treated as equal, anything else as clearly different
            if current != 0.0:
                return False
            ratio = 1.0
        else:
            ratio = current / previous
        if ratio <= 0.993:
            return False
        # Quite similar: probably the same article
        if Settings.DEBUG:
            print(
                "Rejecting {0}, domain {1}, ts {2} because of similarity with {3},"
                " {4}, {5}; ratio is {6:.3f}".format(
                    candidate["heading"],
                    candidate["domain"],
                    candidate["ts"],
                    last["heading"],
                    last["domain"],
                    last["ts"],
                    ratio,
                )
            )
        return True

    @classmethod
    def list_articles(
        cls, session: Session, result: Iterable[Tuple[str, float]], n: int
    ) -> List[SimilarDict]:
        """Convert similarity result tuples into article descriptors.

        The result tuples are (article id, similarity) pairs. Each article
        is looked up in the database via the given session. Entries are
        skipped if they are (nearly) identical to the query itself, are not
        found in the database, or lack a heading or a timestamp. Probable
        duplicates (see _is_probably_same()) are collapsed so that only the
        newest one is kept. At most n descriptors are returned; an empty
        list is returned if n is zero or negative."""
        similar: List[SimilarDict] = []
        if n <= 0:
            # Nothing requested; without this guard the length check
            # below would never trigger and all candidates would be returned
            return similar
        for sid, similarity in result:
            if similarity > 0.9999:
                # The original article (or at least a verbatim copy of it)
                continue
            q = session.query(Article).join(Root).filter(Article.id == sid)
            sa: Optional[Article] = q.one_or_none()
            if sa is None:
                # Article not found
                continue
            if not sa.heading:
                # Skip articles without headings
                continue
            if sa.timestamp is None:
                # Skip articles without timestamps: they can neither be
                # displayed nor compared with the other entries
                continue
            d = SimilarDict(
                heading=sa.heading,
                url=sa.url,
                uuid=sid,
                domain=sa.root.domain,
                ts=sa.timestamp,
                ts_text=sa.timestamp.isoformat()[0:10],
                # Similarity in percent
                similarity=100.0 * similarity,
            )
            # Don't add another article with practically the same similarity
            # as a previous one, as it is very probably a duplicate
            same = next(
                (
                    (ix, p)
                    for ix, p in enumerate(similar)
                    if cls._is_probably_same(d, p)
                ),
                None,
            )
            if same is None:
                # No similar article
                similar.append(d)
                if len(similar) >= n:
                    # Enough articles: we're done
                    break
                continue
            ix, previous = same
            if d["ts"] > previous["ts"]:
                # Similar article, and the one we're considering is
                # newer: replace the one in the list
                if Settings.DEBUG:
                    print(
                        "Replacing: {0} ({1:.2f})".format(
                            d["heading"], d["similarity"]
                        )
                    )
                similar[ix] = d
            elif Settings.DEBUG:
                # Similar article, and the previous one is newer:
                # drop the one we're considering
                print(
                    "Ignoring: {0} ({1:.2f})".format(d["heading"], d["similarity"])
                )
        if Settings.DEBUG and similar:
            print(
                "Similar list is:\n {0}".format("\n ".join(str(s) for s in similar))
            )
        return similar