Skip to content

Commit 0c290bd

Browse files
authored
Add alerts (Slack) (#14)
1 parent 87afa59 commit 0c290bd

3 files changed

Lines changed: 220 additions & 2 deletions

File tree

src/ps_helper/extensions/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@ Place the extension in your Scrapy project and enable it in `settings.py`:
2323
```python
2424
EXTENSIONS = {
2525
'ps_helper.extensions.metrics_extension.MetricsExtension': 500,
26+
'ps_helper.extensions.slack_extension.SlackAlertExtension': 600,
2627
}
28+
29+
SLACK_WEBHOOK_URL = 'url_here'
2730
```
2831

2932
Optionally configure the number of timeline buckets:
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
import datetime
import os

import requests
from scrapy import signals
from scrapy.exceptions import NotConfigured
6+
7+
class EstelaSlackAlerts:
    """Scrapy extension that posts a Slack alert when a job finishes unhealthy.

    Health is judged from the stats collected during the run: item yield vs.
    the spider's optional ``ITEMS_EXPECTED`` attribute, the close reason, the
    log error rate, and proxy/network error counters. Enable the extension by
    setting ``SLACK_WEBHOOK_URL`` in the project settings.
    """

    # Estela web UI root used to build a deep link to the job page.
    # NOTE(review): hard-coded staging host — consider promoting to a setting.
    ESTELA_BASE_URL = "https://hetzner-staging.bitmaker.dev"

    def __init__(self, webhook_url):
        # Incoming-webhook URL the alert payload is POSTed to.
        self.webhook_url = webhook_url

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy entry point: wire signal handlers or disable the extension.

        Raises:
            NotConfigured: when ``SLACK_WEBHOOK_URL`` is not set. This is the
                documented Scrapy convention for disabling an extension; the
                previous ``return None`` made Scrapy register ``None`` as the
                extension object.
        """
        webhook_url = crawler.settings.get('SLACK_WEBHOOK_URL')
        if not webhook_url:
            raise NotConfigured('SLACK_WEBHOOK_URL is not set')

        ext = cls(webhook_url)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_opened(self, spider):
        # Wall-clock start, used by spider_closed to report the run duration.
        self.start_time = datetime.datetime.now()

    def spider_closed(self, spider, reason):
        """Evaluate the finished job and send a Slack alert if it looks bad.

        The alert fires when at least one anomaly is detected (zero items,
        low yield, abnormal close reason, high error rate, or network
        degradation); otherwise a healthy-run log line is emitted.
        """
        stats = spider.crawler.stats.get_stats()
        metrics = self._collect_metrics(spider, stats, reason)

        # An alert reason is appended iff one of the original trigger flags
        # fired, so the reason list doubles as the alert gate.
        if not metrics['alert_reasons']:
            spider.logger.info("Health Check: OK. No Slack alert triggered.")
            return

        payload = {
            "attachments": [
                {
                    "color": "#D32F2F",
                    "fallback": f"Alert: {spider.name}",
                    "blocks": self._build_blocks(spider, metrics),
                }
            ]
        }
        try:
            requests.post(self.webhook_url, json=payload, timeout=10)
        except Exception as e:
            # Alerting must never crash the spider shutdown path.
            spider.logger.error(f"Failed to send Slack alert: {e}")

    def _collect_metrics(self, spider, stats, reason):
        """Derive health metrics and the triggered alert reasons from stats.

        Returns a dict of raw counters, computed rates, and the (possibly
        empty) ``alert_reasons`` list of Slack-formatted anomaly lines.
        """
        items = stats.get('item_scraped_count', 0)
        items_expected = getattr(spider, "ITEMS_EXPECTED", 0)
        responses = stats.get('downloader/response_count', 0)

        # Network and error breakdown.
        status_200 = stats.get('downloader/response_status_count/200', 0)
        err_403 = stats.get('downloader/response_status_count/403', 0)
        err_407 = stats.get('downloader/response_status_count/407', 0)
        err_429 = stats.get('downloader/response_status_count/429', 0)
        err_500 = stats.get('downloader/response_status_count/500', 0)
        err_503 = stats.get('downloader/response_status_count/503', 0)
        tunnel_errors = stats.get(
            'downloader/exception_type_count/twisted.internet.error.TimeoutError', 0)
        log_errors = stats.get('log_count/ERROR', 0)
        server_errors = err_500 + err_503

        # Rates. goal_achievement is None when the spider declares no goal.
        http_success_rate = (status_200 / responses * 100) if responses > 0 else 0.0
        goal_achievement = (items / items_expected * 100) if items_expected > 0 else None

        req_per_item = responses / items if items > 0 else float('inf')
        efficiency_factor = self._efficiency_factor(req_per_item)

        # Weighted overall score: goal achievement dominates when a goal exists.
        if goal_achievement is not None:
            success_rate = ((goal_achievement * 0.7) + (http_success_rate * 0.3)) * efficiency_factor
        else:
            success_rate = http_success_rate * efficiency_factor
        success_rate = min(100.0, max(0.0, success_rate))

        # Alert triggers.
        low_yield = items_expected > 0 and items < items_expected
        zero_items = items == 0
        bad_exit = reason not in ['finished', 'closespider_itemcount']
        high_error_rate = log_errors > (responses * 0.5) if responses > 0 else False
        network_issues = (err_403 + err_407 + err_429 + tunnel_errors + server_errors) > 10

        alert_reasons = []
        if zero_items:
            alert_reasons.append("• *Critical:* No items were extracted.")
        elif low_yield:
            alert_reasons.append(f"• *Low Yield:* Only {items}/{items_expected} items scraped.")
        if bad_exit:
            alert_reasons.append(f"• *Abnormal Exit:* Reason `{reason}`.")
        if network_issues:
            alert_reasons.append("• *Network Degradation:* High number of proxy bans or timeouts.")
        if high_error_rate and not zero_items:
            alert_reasons.append(f"• *High Error Rate:* {log_errors} general errors detected.")

        return {
            'status_200': status_200,
            'err_403': err_403,
            'err_407': err_407,
            'err_429': err_429,
            'server_errors': server_errors,
            'tunnel_errors': tunnel_errors,
            'log_errors': log_errors,
            'success_rate': success_rate,
            'http_success_rate': http_success_rate,
            'goal_achievement': goal_achievement,
            'alert_reasons': alert_reasons,
        }

    @staticmethod
    def _efficiency_factor(req_per_item):
        """Penalty factor for jobs that need many requests per scraped item."""
        for limit, factor in ((3, 1.0), (4, 0.95), (5, 0.90), (7, 0.80)):
            if req_per_item <= limit:
                return factor
        return 0.65

    def _duration_text(self):
        """Human-readable elapsed time, or "N/A" if spider_opened never ran."""
        if not hasattr(self, 'start_time'):
            return "N/A"
        elapsed = datetime.datetime.now() - self.start_time
        return str(elapsed).split('.')[0]  # drop microseconds

    @classmethod
    def _estela_job_info(cls):
        """Read Estela env vars and return ``(job_id, spider_id, job_url)``.

        ``job_url`` is None when the process is not running inside Estela
        (i.e. the expected environment variables are missing or incomplete).
        """
        estela_project_id = os.getenv("ESTELA_PROJECT_ID")
        estela_spider_job = os.getenv("ESTELA_SPIDER_JOB")

        job_id = "N/A"
        spider_id = "N/A"
        job_url = None

        # ESTELA_SPIDER_JOB format is usually: project_id.spider_id.job_id
        if estela_spider_job:
            parts = estela_spider_job.split(".")
            job_id = parts[-1]
            if len(parts) >= 2:
                spider_id = parts[-2]

        if estela_project_id and spider_id != "N/A" and job_id != "N/A":
            job_url = f"{cls.ESTELA_BASE_URL}/projects/{estela_project_id}/spiders/{spider_id}/jobs/{job_id}"

        return job_id, spider_id, job_url

    @staticmethod
    def _network_fields(metrics):
        """Slack field list for the network section; zero counters are hidden."""
        fields = [{"type": "mrkdwn", "text": f"*200 (OK):*\n{metrics['status_200']}"}]
        optional = (
            ('tunnel_errors', "*Tunnel (Timeouts):*"),
            ('err_403', "*403 (Forbidden):*"),
            ('err_407', "*407 (Proxy Auth):*"),
            ('err_429', "*429 (Rate Limit):*"),
            ('server_errors', "*50x (Server Errors):*"),
            ('log_errors', "*Log Errors:*"),
        )
        for key, label in optional:
            if metrics[key] > 0:
                fields.append({"type": "mrkdwn", "text": f"{label}\n{metrics[key]}"})

        # Pad to an even count so Slack renders two tidy columns.
        if len(fields) % 2 != 0:
            fields.append({"type": "mrkdwn", "text": " "})
        return fields

    def _build_blocks(self, spider, metrics):
        """Assemble the Slack Block Kit layout for the alert message."""
        anomalies_text = "\n".join(metrics['alert_reasons'])
        goal = metrics['goal_achievement']
        goal_text = f"{round(goal, 2)}%" if goal is not None else "N/A"
        duration = self._duration_text()

        network_fields = self._network_fields(metrics)
        network_title = "🌐 *Network & Errors Breakdown*" if len(network_fields) > 2 else "🌐 *Network Traffic*"

        job_id, _spider_id, job_url = self._estela_job_info()

        blocks = [
            {
                "type": "header",
                "text": {"type": "plain_text", "text": "Spider Quality Alert", "emoji": True}
            },
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": f"*Spider:* `{spider.name}`\n*Anomalies Detected:*\n{anomalies_text}"}
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "📊 *Key Performance Indicators*"},
                "fields": [
                    {"type": "mrkdwn", "text": f"*Success Rate:*\n{round(metrics['success_rate'], 2)}%"},
                    {"type": "mrkdwn", "text": f"*HTTP Success:*\n{round(metrics['http_success_rate'], 2)}%"},
                    {"type": "mrkdwn", "text": f"*Goal Achieved:*\n{goal_text}"},
                    {"type": "mrkdwn", "text": f"*Duration:*\n{duration}"}
                ]
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": network_title},
                "fields": network_fields
            }
        ]

        # Deep-link button only when running inside Estela.
        if job_url:
            blocks.append({
                "type": "actions",
                "elements": [
                    {
                        "type": "button",
                        "text": {
                            "type": "plain_text",
                            "text": "🖥️ Ver Job en Estela",
                            "emoji": True
                        },
                        "url": job_url,
                        "style": "primary"
                    }
                ]
            })

        blocks.append({
            "type": "context",
            "elements": [{"type": "mrkdwn", "text": f"Estela Job ID: {job_id} | Production Monitoring"}]
        })

        return blocks

src/ps_helper/scripts/generate_report.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ def _generate_retry_reasons_html(data):
327327
values=df_errors["Count"],
328328
marker=dict(
329329
colors=[
330-
"#FF5733", # Naranja rojo (original)
330+
"#FF5733", # Naranja rojo
331331
"#FF6B3D", # Naranja brillante
332332
"#FF8047", # Naranja medio
333333
"#FF9551", # Naranja claro
@@ -336,7 +336,7 @@ def _generate_retry_reasons_html(data):
336336
"#A8C560", # Lima
337337
"#7CB862", # Verde lima
338338
"#50AA64", # Verde medio
339-
"#00BF71" # Verde esmeralda (original)
339+
"#00BF71" # Verde esmeralda
340340
][
341341
: len(df_errors)
342342
]

0 commit comments

Comments
 (0)