-
Notifications
You must be signed in to change notification settings - Fork 83
Expand file tree
/
Copy pathrss.go
More file actions
378 lines (315 loc) · 9.79 KB
/
rss.go
File metadata and controls
378 lines (315 loc) · 9.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
/*
Package rss is a small library for simplifying the parsing of RSS and Atom feeds.
The package conforms to the [RSS 1.0], [RSS 2.0], and [Atom 1.0] specifications.
If you encounter any problems with feeds being parsed incorrectly, please
open an issue on GitHub.
Example usage:
package main
import (
"context"
"github.com/SlyMarbo/rss/v2"
)
func main() {
ctx := context.Background()
reader, err := rss.NewReader()
if err != nil {
// handle error.
}
feed, err := reader.Fetch(ctx, "https://example.com/rss")
if err != nil {
// handle error.
}
// ... Some time later ...
err = reader.UpdatePatiently(ctx, feed)
if err != nil {
// handle error.
}
}
The library does its best to follow the appropriate specifications and not to set the `NextUpdate` time
too soon. It currently follows all update time management methods in the RSS 1.0, 2.0, and Atom 1.0
specifications. If one is not provided, it defaults to 12 hour intervals. If you are having issues
with feed providers dropping connections, use `WithDefaultUpdateInterval` to set the default update
interval.
The project is not proactively maintained, but I'll respond to issues and PRs as soon as I can.
[Atom 1.0]: https://datatracker.ietf.org/doc/html/rfc4287
[RSS 1.0]: https://web.resource.org/rss/1.0/spec
[RSS 2.0]: https://www.rssboard.org/rss-specification
*/
package rss
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"slices"
"strings"
"time"
)
// ErrTooSoon indicates that a feed cannot be updated
// yet, as the desired interval between updates has
// not yet elapsed.
//
// It is returned unwrapped by [Reader.Update] when the
// feed's NextUpdate time is still in the future, so it
// can be detected with [errors.Is].
var ErrTooSoon = errors.New("rss: too soon to update feed")
// Reader stores the configuration data used to fetch,
// parse, and update RSS feeds. Reader supports RSS 2.0,
// RSS 1.0, and Atom 1.0. A single Reader can process
// many different feeds.
//
// [NewReader] populates these fields with defaults
// before applying any options.
type Reader struct {
	// General config.

	// time supplies the current time. When nil, the
	// reader falls back to time.Now.
	time func() time.Time // Optional override on the current time.
	// slog receives the messages emitted through the
	// debug, info, warn, and error helpers.
	slog *slog.Logger

	// Networking config.

	// http is the client used by Fetch to download feeds.
	http *http.Client

	// Parsing config.

	// defaultUpdateInterval defaults to 12 hours in NewReader.
	defaultUpdateInterval time.Duration // Minimum time before updating a feed that hasn't specified an interval.
	// timeLayouts holds the layouts without named time
	// zones that are tried when parsing timestamps.
	timeLayouts []string
	// timeLayoutsWithNamedLocation holds the layouts that
	// use a named time zone (such as "MST").
	timeLayoutsWithNamedLocation []string
}
// NewReader builds a [Reader]. The reader is first
// populated with sensible defaults (no logging, the
// default HTTP client, a 12 hour fallback update
// interval, and the built-in time layouts), after which
// each option is applied in the order given.
//
// NewReader only fails if one of the options fails, in
// which case that option's error is returned as-is and
// no reader is returned.
func NewReader(options ...Option) (*Reader, error) {
	reader := new(Reader)

	// General config: logging is discarded by default.
	reader.slog = slog.New(slog.DiscardHandler)

	// Networking config.
	reader.http = http.DefaultClient

	// Parsing config. The layout slices are cloned so
	// that options appending to them cannot modify the
	// package-level defaults.
	reader.defaultUpdateInterval = 12 * time.Hour
	reader.timeLayouts = slices.Clone(defaultTimeLayouts)
	reader.timeLayoutsWithNamedLocation = slices.Clone(defaultTimeLayoutsWithNamedLocation)

	for i := range options {
		if err := options[i](reader); err != nil {
			return nil, err
		}
	}

	return reader, nil
}
// now reports the current time, preferring the
// reader's configured clock override when one is set
// and falling back to [time.Now] otherwise.
func (r *Reader) now() time.Time {
	if clock := r.time; clock != nil {
		return clock()
	}

	return time.Now()
}
// debug forwards msg and attrs to the reader's
// logger at the debug level.
func (r *Reader) debug(ctx context.Context, msg string, attrs ...slog.Attr) {
	const level = slog.LevelDebug
	r.slog.LogAttrs(ctx, level, msg, attrs...)
}
// info forwards msg and attrs to the reader's
// logger at the info level.
func (r *Reader) info(ctx context.Context, msg string, attrs ...slog.Attr) {
	const level = slog.LevelInfo
	r.slog.LogAttrs(ctx, level, msg, attrs...)
}
// warn forwards msg and attrs to the reader's
// logger at the warn level.
func (r *Reader) warn(ctx context.Context, msg string, attrs ...slog.Attr) {
	const level = slog.LevelWarn
	r.slog.LogAttrs(ctx, level, msg, attrs...)
}
// error forwards msg and attrs to the reader's
// logger at the error level.
func (r *Reader) error(ctx context.Context, msg string, attrs ...slog.Attr) {
	const level = slog.LevelError
	r.slog.LogAttrs(ctx, level, msg, attrs...)
}
// Fetch downloads and parses the RSS/Atom feed at
// the given URL.
//
// The request is a GET issued with the reader's HTTP
// client and bound to ctx. A response whose status is
// outside the 2xx range is reported as an error rather
// than being handed to the parser, because the HTTP
// client does not treat such responses as failures and
// their bodies are typically error pages, not feeds.
//
// Underlying errors are wrapped with %w, so callers can
// use [errors.Is] and [errors.As] to detect causes such
// as context cancellation. On any error the returned
// feed is nil.
func (r *Reader) Fetch(ctx context.Context, url string) (*Feed, error) {
	// Upper bound on how much of an error response is
	// drained before closing, so a huge error page
	// cannot stall the caller.
	const maxErrorDrain = 64 << 10

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to construct GET request: %w", err)
	}

	res, err := r.http.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to request feed: %w", err)
	}

	if res.StatusCode < 200 || res.StatusCode > 299 {
		// Best-effort drain and close so the connection
		// can be reused; the status error is the one
		// worth reporting.
		_, _ = io.Copy(io.Discard, io.LimitReader(res.Body, maxErrorDrain))
		_ = res.Body.Close()
		return nil, fmt.Errorf("failed to request feed: unexpected HTTP status %s", res.Status)
	}

	feed, err := r.Parse(url, res.Body)
	if err != nil {
		// Best-effort drain and close; the parse error
		// takes priority over any cleanup failure.
		_, _ = io.Copy(io.Discard, res.Body)
		_ = res.Body.Close()
		return nil, fmt.Errorf("failed to parse feed: %w", err)
	}

	if err := res.Body.Close(); err != nil {
		return nil, fmt.Errorf("failed to close response body: %w", err)
	}

	return feed, nil
}
// Parse decodes RSS or Atom data read from data.
//
// The feed format is detected by inspecting a prefix of
// the input (up to 512, as passed to readStringPrefix):
// a prefix containing "<rss" is parsed as RSS 2.0, one
// declaring the RSS 1.0 default namespace is parsed as
// RSS 1.0, and anything else is parsed as Atom 1.0.
// The url is passed through to the format-specific
// parser.
func (r *Reader) Parse(url string, data io.Reader) (*Feed, error) {
	head, data, err := readStringPrefix(data, 512)
	if err != nil {
		return nil, err
	}

	switch {
	case strings.Contains(head, "<rss"):
		return r.parseRSS2(url, data)
	case strings.Contains(head, `xmlns="http://purl.org/rss/1.0/"`):
		return r.parseRSS1(url, data)
	default:
		return r.parseAtom1(url, data)
	}
}
// Update attempts to refresh the feed with new items.
// It never waits: if the feed's NextUpdate time has
// not yet been reached, Update returns [ErrTooSoon]
// straight away. A feed without an update URL is
// rejected with an error.
//
// Callers that would rather block until an update is
// permitted should use [Reader.UpdatePatiently].
func (r *Reader) Update(ctx context.Context, feed *Feed) error {
	switch {
	case feed.UpdateURL == "":
		return errors.New("feed has no update URL")
	case feed.NextUpdate.After(r.now()):
		// Not enough time has elapsed since the last update.
		return ErrTooSoon
	}

	return r.update(ctx, feed)
}
// UpdatePatiently attempts to refresh the feed with
// new items. If the feed's NextUpdate time has not yet
// been reached, UpdatePatiently blocks until it has
// and then performs the update. If ctx is done while
// UpdatePatiently is still waiting, the context's
// error is returned instead. A feed without an update
// URL is rejected with an error.
//
// Callers that would rather fail fast when it is too
// soon to update should use [Reader.Update].
func (r *Reader) UpdatePatiently(ctx context.Context, feed *Feed) error {
	if len(feed.UpdateURL) == 0 {
		return errors.New("feed has no update URL")
	}

	if err := r.waitForFeed(ctx, feed); err != nil {
		return err
	}

	return r.update(ctx, feed)
}
// waitForFeed blocks until the feed's NextUpdate time
// has been reached, returning nil once the feed may be
// updated. It returns immediately if that time is not
// in the future, and returns the context's error if
// ctx is done before the wait finishes.
func (r *Reader) waitForFeed(ctx context.Context, feed *Feed) error {
	current := r.now()
	if !feed.NextUpdate.After(current) {
		// Enough time has already elapsed.
		return nil
	}

	remaining := feed.NextUpdate.Sub(current)
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(remaining):
		return nil
	}
}
// update performs an update to a feed. It assumes
// an appropriate interval has passed since the last
// update.
//
// The feed is re-fetched from its UpdateURL. On
// success the feed's NextUpdate, Title, and Description
// are overwritten with the fetched values, and every
// fetched item whose ID is not already present is
// appended to feed.Items, incrementing feed.Unread once
// per appended item. On failure the feed is left
// untouched and the fetch error is returned wrapped
// with %w, so callers can inspect it with [errors.Is]
// and [errors.As].
func (r *Reader) update(ctx context.Context, feed *Feed) error {
	// Index the IDs we already hold so that duplicates
	// in the fetched feed can be skipped.
	seen := make(map[string]struct{}, len(feed.Items))
	for _, item := range feed.Items {
		seen[item.ID] = struct{}{}
	}

	fresh, err := r.Fetch(ctx, feed.UpdateURL)
	if err != nil {
		return fmt.Errorf("failed to update feed: %w", err)
	}

	feed.NextUpdate = fresh.NextUpdate
	feed.Title = fresh.Title
	feed.Description = fresh.Description

	for _, item := range fresh.Items {
		if _, ok := seen[item.ID]; ok {
			continue
		}

		// Record the ID immediately so that an ID repeated
		// within the fetched feed is only added once.
		seen[item.ID] = struct{}{}
		feed.Items = append(feed.Items, item)
		feed.Unread++
	}

	return nil
}
// Option represents a function that can be used to
// configure a [Reader]. Options are typically used
// with [NewReader].
//
// An option receives the reader being configured and
// returns a non-nil error to reject the configuration;
// [NewReader] stops at the first such error and
// returns it.
type Option func(*Reader) error
// General config.
// WithNow returns an option that makes the reader
// call now whenever it needs the current time.
//
// By default, readers use [time.Now]. Passing a nil
// callback keeps that default.
func WithNow(now func() time.Time) Option {
	var setClock Option = func(reader *Reader) error {
		reader.time = now
		return nil
	}

	return setClock
}
// WithLogger configures the reader to use the given
// structured logger.
//
// A nil logger disables logging: the reader is given a
// logger that discards everything, matching the
// default set by [NewReader]. Storing a nil logger
// directly would make the reader panic on its first
// log call.
func WithLogger(logger *slog.Logger) Option {
	return func(r *Reader) error {
		if logger == nil {
			r.slog = slog.New(slog.DiscardHandler)
			return nil
		}

		r.slog = logger
		return nil
	}
}
// Networking config.
// WithHTTPClient configures the reader to use the given
// HTTP client when fetching or updating feeds.
//
// A nil client selects [http.DefaultClient], matching
// the default set by [NewReader]. Storing a nil client
// directly would make the reader panic on its first
// request.
func WithHTTPClient(client *http.Client) Option {
	return func(r *Reader) error {
		if client == nil {
			r.http = http.DefaultClient
			return nil
		}

		r.http = client
		return nil
	}
}
// Parsing config.
// WithDefaultUpdateInterval returns an option that
// sets the minimum interval between updates used for
// feeds that do not specify an interval themselves.
func WithDefaultUpdateInterval(interval time.Duration) Option {
	var setInterval Option = func(reader *Reader) error {
		reader.defaultUpdateInterval = interval
		return nil
	}

	return setInterval
}
// WithExtraTimeLayouts returns an option that adds
// the given time layouts to the built-in layouts the
// reader tries when parsing timestamps.
//
// These layouts must not use names like `"EST"` to
// represent time zones: any layout containing the
// reference zone name "MST" is rejected with an error
// and no layouts are added. Layouts that do use named
// locations belong in
// [WithExtraTimeLayoutsWithNamedLocation] instead.
func WithExtraTimeLayouts(layouts ...string) Option {
	usesNamedZone := func(layout string) bool {
		return strings.Contains(layout, "MST")
	}

	return func(r *Reader) error {
		// Reject the first layout that uses a named zone.
		if i := slices.IndexFunc(layouts, usesNamedZone); i >= 0 {
			return fmt.Errorf("cannot use layout %q with WithExtraTimeLayouts: has named layout", layouts[i])
		}

		r.timeLayouts = append(r.timeLayouts, layouts...)
		return nil
	}
}
// WithExtraTimeLayoutsWithNamedLocation returns an
// option that adds the given time layouts to the
// built-in named-location layouts the reader tries
// when parsing timestamps.
//
// These layouts can use names like `"EST"` to
// represent time zones, and each one must contain the
// reference zone name "MST": any layout without it is
// rejected with an error and no layouts are added.
// Layouts that do not use named locations belong in
// [WithExtraTimeLayouts] instead.
func WithExtraTimeLayoutsWithNamedLocation(layouts ...string) Option {
	lacksNamedZone := func(layout string) bool {
		return !strings.Contains(layout, "MST")
	}

	return func(r *Reader) error {
		// Reject the first layout that has no named zone.
		if i := slices.IndexFunc(layouts, lacksNamedZone); i >= 0 {
			return fmt.Errorf("cannot use layout %q with WithExtraTimeLayoutsWithNamedLocation: missing named layout", layouts[i])
		}

		r.timeLayoutsWithNamedLocation = append(r.timeLayoutsWithNamedLocation, layouts...)
		return nil
	}
}