# Source: search.py from the mideind/GreynirServer repository (master branch), as shown on GitHub
"""

    Greynir: Natural language processing for Icelandic

    Search module

    Copyright (C) 2023 Miðeind ehf.
    Original author: Vilhjálmur Þorsteinsson

       This program is free software: you can redistribute it and/or modify
       it under the terms of the GNU General Public License as published by
       the Free Software Foundation, either version 3 of the License, or
       (at your option) any later version.
       This program is distributed in the hope that it will be useful,
       but WITHOUT ANY WARRANTY; without even the implied warranty of
       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see http://www.gnu.org/licenses/.


    This module implements a search mechanism. The Search class parses
    a search string into a list of word stems and creates a topic vector from it,
    which is then used in a similarity query to find related articles.

"""

from typing import Iterable, Iterator, Optional, List, Tuple, TypedDict

from datetime import datetime, timedelta

from settings import Settings
from db import Session
from db.models import Root, Article
from similar import SimilarityClient


class SimilarDict(TypedDict):
    """Typed dictionary describing a single article in the result of a
    similarity query, as built by Search.list_articles()"""

    # Heading of the article (articles without a heading are never listed)
    heading: str
    # URL of the article
    url: str
    # Article identifier, as returned by the similarity server
    uuid: str
    # Domain of the root (source site) that the article belongs to
    domain: str
    # Timestamp of the article
    ts: datetime
    # First 10 characters of the ISO 8601 form of ts, i.e. the date (YYYY-MM-DD)
    ts_text: str
    # Similarity expressed in percent (100.0 * the raw similarity score)
    similarity: float


class WeightsDict(TypedDict):
    """Typed dictionary for the result of a term similarity query,
    as returned by Search.list_similar_to_terms()"""

    # The "weights" list reported by the similarity server for the query
    # (presumably one weight per search term -- TODO confirm against the server)
    weights: List[float]
    # Descriptors of the articles found to be similar to the terms
    articles: List[SimilarDict]


class Search:

    """This class wraps search queries to the similarity server
    via the similarity client. All functionality is exposed through
    class methods, so the class does not need to be instantiated."""

    # Number of extra candidates requested from the similarity server,
    # since list_articles() discards some of them (the original article,
    # articles that are missing or lack a heading, probable duplicates)
    _EXTRA_CANDIDATES = 5
    # Raw similarity above which a candidate is considered to be the
    # original article itself (or a verbatim copy of it)
    _ORIGINAL_THRESHOLD = 0.9999
    # Two articles from the same domain are only considered duplicates
    # if their timestamps are at most this far apart
    _DUPLICATE_WINDOW = timedelta(minutes=10)
    # ...and if the ratio between their similarity scores exceeds this value
    _DUPLICATE_RATIO = 0.993

    def __init__(self) -> None:
        """This class is normally not instantiated"""
        pass

    @classmethod
    def _new_client(cls) -> SimilarityClient:
        """Create a new similarity client for each request to avoid
        connection sharing issues between concurrent greenlets/threads"""
        return SimilarityClient()

    @classmethod
    def list_similar_to_article(
        cls, session: Session, uuid: str, n: int
    ) -> Tuple[List[SimilarDict], bool]:
        """List n articles that are similar to the article with the given id.
        Returns a tuple of (similar_articles, not_indexed) where not_indexed
        is True if the article has not yet been indexed for similarity."""
        client = cls._new_client()
        try:
            result = client.list_similar_to_article(
                uuid, n=n + cls._EXTRA_CANDIDATES
            )
            # A missing "not_indexed" key is taken to mean that the article is indexed
            not_indexed: bool = result.get("not_indexed", False)
            articles: List[Tuple[str, float]] = result.get("articles", [])
            return cls.list_articles(session, articles, n), not_indexed
        finally:
            # Always release the per-request client, also on errors
            client.close()

    @classmethod
    def list_similar_to_topic(
        cls, session: Session, topic_vector: List[float], n: int
    ) -> List[SimilarDict]:
        """List n articles that are similar to the given topic vector"""
        client = cls._new_client()
        try:
            result = client.list_similar_to_topic(
                topic_vector, n=n + cls._EXTRA_CANDIDATES
            )
            articles: List[Tuple[str, float]] = result.get("articles", [])
            return cls.list_articles(session, articles, n)
        finally:
            # Always release the per-request client, also on errors
            client.close()

    @classmethod
    def list_similar_to_terms(
        cls, session: Session, terms: List[Tuple[str, str]], n: int
    ) -> WeightsDict:
        """List n articles that are similar to the given terms. The
        terms are expected to be a list of (stem, category) tuples.
        The result contains the weights reported by the similarity
        server along with the list of similar articles."""
        client = cls._new_client()
        try:
            result = client.list_similar_to_terms(
                terms, n=n + cls._EXTRA_CANDIDATES
            )
            articles: List[Tuple[str, float]] = result.get("articles", [])
            weights: List[float] = result.get("weights", [])
            return WeightsDict(
                weights=weights, articles=cls.list_articles(session, articles, n)
            )
        finally:
            # Always release the per-request client, also on errors
            client.close()

    @classmethod
    def _is_probably_same(cls, candidate: SimilarDict, last: SimilarDict) -> bool:
        """Return True if the candidate article is probably the same as
        the one already described in the last object, i.e. if both come
        from the same domain, have timestamps close to each other and
        have almost identical similarity scores"""
        if last["domain"] != candidate["domain"]:
            # Another root domain: can't be the same content
            return False
        if abs(last["ts"] - candidate["ts"]) > cls._DUPLICATE_WINDOW:
            # More than 10 minutes timestamp difference
            return False
        previous = last["similarity"]
        if previous == 0.0:
            # The similarity ratio is undefined: we have no evidence
            # that the articles are the same, so keep them apart
            # (and avoid a ZeroDivisionError)
            return False
        ratio = candidate["similarity"] / previous
        if ratio > cls._DUPLICATE_RATIO:
            # Quite similar: probably the same article
            if Settings.DEBUG:
                print(
                    "Rejecting {0}, domain {1}, ts {2} because of similarity with {3},"
                    " {4}, {5}; ratio is {6:.3f}".format(
                        candidate["heading"],
                        candidate["domain"],
                        candidate["ts"],
                        last["heading"],
                        last["domain"],
                        last["ts"],
                        ratio,
                    )
                )
            return True
        return False

    @classmethod
    def _find_duplicate(
        cls, similar: List[SimilarDict], candidate: SimilarDict
    ) -> Optional[int]:
        """Return the index of the first entry in the similar list that is
        probably the same article as the candidate, or None if there is none"""
        for ix, previous in enumerate(similar):
            if cls._is_probably_same(candidate, previous):
                return ix
        return None

    @classmethod
    def list_articles(
        cls, session: Session, result: Iterable[Tuple[str, float]], n: int
    ) -> List[SimilarDict]:
        """Convert similarity result tuples into article descriptors.
        The result is an iterable of (article id, similarity) tuples,
        where similarity is a raw score that is converted to percent in
        the descriptors. At most n descriptors are returned. Candidates
        are skipped if they are the original article itself, are not found
        in the database, or lack a heading or a timestamp. Probable
        duplicates are collapsed into one entry, keeping the newest one."""
        similar: List[SimilarDict] = []
        if n <= 0:
            # Nothing requested: don't bother querying the database
            return similar
        for sid, similarity in result:
            if similarity > cls._ORIGINAL_THRESHOLD:
                # The original article (or at least a verbatim copy of it)
                continue
            q = session.query(Article).join(Root).filter(Article.id == sid)
            sa: Optional[Article] = q.one_or_none()
            if sa is None:
                # Article not found
                continue
            if not sa.heading:
                # Skip articles without headings
                continue
            if sa.timestamp is None:
                # Skip articles without timestamps: they can neither be
                # dated nor compared with potential duplicates
                continue

            d = SimilarDict(
                heading=sa.heading,
                url=sa.url,
                uuid=sid,
                domain=sa.root.domain,
                ts=sa.timestamp,
                ts_text=sa.timestamp.isoformat()[0:10],
                # Similarity in percent
                similarity=100.0 * similarity,
            )
            # Don't add another article with practically the same similarity
            # as a previous one, as it is very probably a duplicate
            dup_ix = cls._find_duplicate(similar, d)
            if dup_ix is None:
                # No similar article
                similar.append(d)
                if len(similar) >= n:
                    # Enough articles: we're done
                    break
            elif d["ts"] > similar[dup_ix]["ts"]:
                # Similar article, and the one we're considering is
                # newer: replace the one in the list
                if Settings.DEBUG:
                    print(
                        "Replacing: {0} ({1:.2f})".format(
                            d["heading"], d["similarity"]
                        )
                    )
                similar[dup_ix] = d
            else:
                # Similar article, and the previous one is newer:
                # drop the one we're considering
                if Settings.DEBUG:
                    print(
                        "Ignoring: {0} ({1:.2f})".format(
                            d["heading"], d["similarity"]
                        )
                    )

        if Settings.DEBUG and similar:
            print(
                "Similar list is:\n   {0}".format("\n   ".join(str(s) for s in similar))
            )
        return similar