python3/flora1.py at main · nogizakapython/python3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Company profile retrieval tool
# Initial development: 2024/3/2
# Created by 乃木坂好きのITエンジニア

# Library imports
from bs4 import BeautifulSoup
import urllib3
import codecs
import datetime
import re
import os
import requests

# URL of the company-profile page to fetch
get_url = "https://flora-inc.co.jp/page/company.php"

# Timestamp of this run: 4-digit year, 2-digit month and day, then
# hours, minutes and seconds (YYYYmmddHHMMSS)
dt = datetime.datetime.now()
now_date = f"{dt:%Y%m%d%H%M%S}"

# Per-run file that receives the raw <table> markup
file_name = f"flora{now_date}.txt"
# File that receives the rows extracted from the raw markup
result_file = "flora.txt"

# Line prefixes that mark the rows worth keeping
pattern1, pattern2, pattern3 = (
    '<td class="title">',
    '<td class="content">',
    '<p>',
)


# Scraping function
def scraping():
    """Download the company-profile page and save its first ``<table>``.

    Sends a GET request to ``get_url``, parses the response body with
    BeautifulSoup's ``html.parser`` and appends the string form of the
    first ``<table>`` element to ``file_name`` as UTF-8. When the page
    contains no table, the text ``None`` is written instead.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within the timeout.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(get_url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    # First <table> element of the page (None when there is none).
    title_text = soup.find('table')
    # Close the file deterministically so that the data is flushed to disk
    # before main() reads it back.
    with open(file_name, 'a', encoding='utf-8') as out:
        print(title_text, file=out)

# Main function
def main():
    """Scrape the page and copy the relevant rows into ``result_file``.

    Calls ``scraping()`` to produce ``file_name``, then reads that file
    line by line. Full-width spaces are replaced by ASCII spaces, and every
    line that starts with ``pattern1``, ``pattern2`` or ``pattern3`` is
    written to ``result_file``. Any ``result_file`` left over from an
    earlier run is overwritten.
    """
    scraping()

    # Compile the patterns once instead of on every line; match() anchors at
    # the start of the line, so only lines beginning with a pattern qualify.
    matchers = [re.compile(p) for p in (pattern1, pattern2, pattern3)]

    # Mode "w" truncates a result file from a previous run, which replaces
    # the former remove-then-append sequence. Both files are opened once and
    # closed automatically, even if an error occurs midway.
    with open(file_name, "r", encoding="utf-8") as file_data:
        with open(result_file, mode="w", encoding="utf-8") as f:
            for line in file_data:
                # Normalise full-width (ideographic, U+3000) spaces.
                line = line.replace("\u3000", " ")
                # Write only the lines that carry the required data.
                if any(m.match(line) for m in matchers):
                    f.write(line)

# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()