爬取新浪财经ESG评级指数

网络爬虫
Author

Tom

Published

January 13, 2023

新浪财经的 ESG 评级指数页面分页利用了 JS 进行动态渲染,因为数据量不大,就利用 Selenium 来爬取。

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException

import time
import logging
import pandas as pd

from tqdm import tqdm

我们利用 CSS 选择器来解析所需数据,然后模拟点击下一页,循环解析。

这里存在的一个问题是,点击下一页后,JS 渲染需要时间,页面可能还在加载,这时候读取 browser.find_elements() 返回的元素(例如 element.text)可能会报错 StaleElementReferenceException。一个选择是在每次解析数据前做一个判断,判断所需数据是否已经加载出来了。而由于网站没有反爬措施,另一个选择是暴力循环,直到页面加载出来。

def parse_data(page, last_page=510):
    """Scrape the rating table shown in the browser, then go to the next page.

    Uses the module-level ``browser``. Each column is read with a CSS
    selector; when ``page`` is below ``last_page`` the ``.pages-rt`` button is
    clicked afterwards so the following call sees the next page.

    A ``StaleElementReferenceException`` (raised while the page is still
    re-rendering) triggers a 0.2 s pause and a retry of the whole step. Any
    other exception is logged as a warning and ends the attempt.

    Args:
        page: Number of the page currently displayed.
        last_page: Page number at which no further "next" click is made.
            Defaults to 510.

    Returns:
        dict: Column name -> list of cell texts. The lists are empty if a
        non-retryable error happened before any successful scrape.
    """
    selectors = {
        "lft_list": '#lft-name-list i',
        "esg_score": 'div[data-key="esg_score"] li',
        "env_score": 'div[data-key="env_score"] li',
        "social_score": 'div[data-key="social_score"] li',
        "governance_score": 'div[data-key="governance_score"] li',
    }

    # Fallback result so the final return never hits an unbound name when the
    # very first attempt fails with a non-retryable error.
    data_dic = {column: [] for column in selectors}

    while True:
        try:
            # Rebound only after every column was read successfully, so a
            # failure mid-scrape never leaves a half-filled result behind.
            data_dic = {
                column: [
                    element.text
                    for element in browser.find_elements(By.CSS_SELECTOR, css)
                ]
                for column, css in selectors.items()
            }

            if page < last_page:
                # locate the next page button and click
                browser.find_element(By.CSS_SELECTOR, '.pages-rt').click()
        except StaleElementReferenceException:
            # The DOM was replaced under us; give the page time to refresh.
            time.sleep(0.2)
        except Exception as e:
            logging.warning(e)
            break
        else:
            break

    return data_dic
# Show INFO and above, formatted as "timestamp - LEVEL: message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Headless Chrome with the automation switch and automation extension disabled.
option = ChromeOptions()
option.add_argument('--headless')
for _opt_name, _opt_value in (
    ('excludeSwitches', ['enable-automation']),
    ('useAutomationExtension', False),
):
    option.add_experimental_option(_opt_name, _opt_value)

browser = webdriver.Chrome(options=option)

# Register a script that makes navigator.webdriver read as undefined.
_hide_webdriver_js = (
    'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
)
browser.execute_cdp_cmd(
    'Page.addScriptToEvaluateOnNewDocument',
    {'source': _hide_webdriver_js},
)
{'identifier': '2'}
# Explicit wait with a 5-second timeout, used for the initial page load.
wait = WebDriverWait(browser, 5)
# One dict of column lists per scraped page.
res = []

try:
    try:
        browser.get('https://finance.sina.com.cn/esg/grade.shtml')
        wait.until(
            EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.lft-bodylist'))
        )
    except Exception as e:
        # Best effort: a failed load/wait is logged and scraping still proceeds.
        logging.warning(e)

    # Pages 1..510, matching the last-page check inside parse_data.
    for page in tqdm(range(1, 511)):
        data_dic = parse_data(page)
        res.append(data_dic)
finally:
    # Always shut the driver down, even if the loop raises or is interrupted;
    # quit() closes every window, so a separate close() is unnecessary.
    browser.quit()
2023-01-14 20:13:53,283 - WARNING: Message: 

100%|████████████████████████████████████████████████████████████████████████████████| 510/510 [14:00<00:00,  1.65s/it]
# Combine the per-page results into one table. DataFrame.append was removed in
# pandas 2.0 (and copied the whole frame on every call), so all page frames
# are concatenated in a single pd.concat instead.
ESG_COLUMNS = ['lft_list', 'esg_score', 'env_score', 'social_score', 'governance_score']
page_frames = [pd.DataFrame(data_dic) for data_dic in res]

# pd.concat rejects an empty list, so fall back to an empty frame with the
# expected columns when nothing was scraped.
ESG = pd.concat(page_frames) if page_frames else pd.DataFrame(columns=ESG_COLUMNS)

# Preview the first and last five rows (each page keeps its own row index).
pd.concat([ESG.head(5), ESG.tail(5)])
lft_list esg_score env_score social_score governance_score
0 酒鬼酒 15.3(D) 3.4(D-) 10.3(D) 38.9(C)
1 耐斯系统 61.4(B) 38.9(C) 60.2(B) 69.3(B+)
2 SunCoke Energy 60.6(B) 68.9(B+) 56.6(B-) 55.1(B-)
3 普尔斯玛特 31.5(C-) 8.9(D) 40.8(C) 41.6(C)
4 新城发展 53.3(B-) 30.3(C-) 43.6(C+) 86.1(A)
6 华能国际电力股份 51.1(B-) 61.2(B) 41.1(C) 46.7(C+)
7 安徽皖通高速公路 44.7(C+) 51.2(B-) 41.1(C) 43.4(C+)
8 民生银行 47.4(C+) 37.0(C) 71.4(B+) 18.3(D+)
9 中远海能 57.1(B-) 61.1(B) 38.7(C) 78.8(A-)
0 中国石油化工股份 58.8(B) 69.9(B+) 50.3(B-) 57.9(B-)
# Write the combined table to ESG.csv, omitting the row index.
ESG.to_csv(path_or_buf='ESG.csv', index=False)