DjangoFrameWork/pythonSpiderAboutArticle.py at master · PythonBian/DjangoFrameWork · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""
使用之前请按照 requests和lxml模块
"""

import pymysql
import requests
from lxml import etree
from time import sleep


def get_data(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }

    response = requests.get(url,headers = headers)
    content = response.content.decode()

    html = etree.HTML(content)
    title, = html.xpath('//div[@class="title"]/h1/text()')
    date = html.xpath('//div[@class="info"]/text()')[0].split(" ")[0]
    author, = html.xpath('//div[@class="info"]/a/text()')
    content = "\n".join(html.xpath('//div[@class="content "]/p/text()')[1:])
    print(title)
    if not title:
        title = "位命名"
    if not  date:
        date = "1970-01-01"
    if not  author:
        author = 1
    if not content:
        content = "empty"
    description = content[:20]+"......"
    return title,description,date,content,"images/01.jpg",1

def saveData(datas):
    connect = pymysql.connect(
        host = "localhost",
        user = "root",
        password = "111111",
        database = "articleblog"
    )

    cursor = connect.cursor()
    sql = """INSERT INTO article_article (title, description, public_time, content, picture, article_author_id )
    VALUES
    ('%s','%s','%s','%s','%s','%s')"""%datas

    cursor.execute(sql)

    connect.commit()
    cursor.close()
    connect.close()
    print("save is ok")

def get_page():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    for i in range(10):
        url = "https://www.jj59.com/gushi/zheligushi/list_127_%s.html"%i
        response = requests.get(url, headers=headers)
        content = response.content.decode()
        html = etree.HTML(content)
        href = html.xpath('//li[@class="bd"]/h3/a/@href')
        for h in href:
            article_url = "https://www.jj59.com"+h
            datas = get_data(article_url)
            saveData(datas)
        sleep(1)
if __name__ == "__main__":
    get_page()
    # url = "https://www.jj59.com/jjart/428923.html"
    # datas = get_dhata(url)
    # saveData(datas)