from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
import time
import logging
import pandas as pd
from tqdm import tqdm
新浪财经的 ESG 评级指数页面分页利用了 JS 进行动态渲染,因为数据量不大,就利用 Selenium 来爬取。
我们利用 CSS 选择器来解析所需数据,然后模拟点击下一页,循环解析。
这里存在的一个问题是:点击下一页后,JS 渲染需要时间,页面可能还在加载,这时候 browser.find_elements() 会报错 StaleElementReferenceException。一个选择是在每次解析数据前做一个判断,判断所需数据是否已经加载出来了;而由于网站没有反爬措施,另一个选择是暴力循环重试,直到页面加载出来。
def parse_data(page, last_page=510):
    """Scrape the currently rendered ESG table page, then click "next page".

    Uses the module-level ``browser`` WebDriver. The whole parse-and-click
    step is retried until it completes, because the page is re-rendered by
    JavaScript after each click and element handles can go stale while that
    happens.

    Args:
        page: Number of the page being parsed (1-based in the calling loop).
        last_page: Number of the final page. The next-page button is only
            clicked while ``page`` is below this value. Defaults to 510.

    Returns:
        A dict mapping each column name (``lft_list``, ``esg_score``,
        ``env_score``, ``social_score``, ``governance_score``) to the list
        of cell texts found on the page. If an unexpected (non-stale) error
        interrupts the very first attempt, every list is empty.
    """
    # Column name -> CSS selector of the cells holding that column's values.
    selectors = {
        "lft_list": '#lft-name-list i',
        "esg_score": 'div[data-key="esg_score"] li',
        "env_score": 'div[data-key="env_score"] li',
        "social_score": 'div[data-key="social_score"] li',
        "governance_score": 'div[data-key="governance_score"] li',
    }
    # Pre-seed the result so the final ``return`` is always bound, even when
    # an unexpected error is logged before any data has been collected.
    data_dic = {key: [] for key in selectors}

    while True:
        try:
            # use CSS selectors to locate the data
            data_dic = {
                key: [
                    element.text
                    for element in browser.find_elements(By.CSS_SELECTOR, css)
                ]
                for key, css in selectors.items()
            }
            if page < last_page:
                # locate the next page button and click
                browser.find_element(By.CSS_SELECTOR, '.pages-rt').click()
        except StaleElementReferenceException:
            # The page is still refreshing: wait 0.2s and parse again.
            time.sleep(0.2)
        except Exception as e:
            # Best effort: report anything else and give up on this page.
            logging.warning(e)
            break
        else:
            break
    return data_dic
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
)

# basic settings for headless browser and anti-detection
option = ChromeOptions()
option.add_argument('--headless')
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option('useAutomationExtension', False)

browser = webdriver.Chrome(options=option)

# Register a script for every new document that makes navigator.webdriver
# read as undefined.
browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
    'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
})
{'identifier': '2'}
wait = WebDriverWait(browser, 5)
res = []  # one dict of column lists per scraped page

try:
    try:
        browser.get('https://finance.sina.com.cn/esg/grade.shtml')
        wait.until(
            EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.lft-bodylist'))
        )
    except Exception as e:
        # Best effort: a failed or timed-out wait is only logged and the
        # scrape is still attempted, since parse_data retries on its own.
        logging.warning(e)

    for page in tqdm(range(1, 511)):
        res.append(parse_data(page))
finally:
    # Always end the WebDriver session, even if scraping raised; quit()
    # also closes the browser windows, so no separate close() is needed.
    browser.quit()
2023-01-14 20:13:53,283 - WARNING: Message:
100%|████████████████████████████████████████████████████████████████████████████████| 510/510 [14:00<00:00, 1.65s/it]
esg_columns = ['lft_list', 'esg_score', 'env_score', 'social_score', 'governance_score']

# Build one frame per scraped page and join them in a single concat call.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration. Each page keeps its own row index, as before.
page_frames = [pd.DataFrame(data_dic) for data_dic in res]
page_frames = [frame for frame in page_frames if not frame.empty]
if page_frames:
    ESG = pd.concat(page_frames)
else:
    # Nothing scraped: fall back to an empty frame with the expected columns.
    ESG = pd.DataFrame(columns=esg_columns)

# Preview the first and last five rows.
pd.concat([ESG.head(5), ESG.tail(5)])
| | lft_list | esg_score | env_score | social_score | governance_score |
|---|---|---|---|---|---|
| 0 | 酒鬼酒 | 15.3(D) | 3.4(D-) | 10.3(D) | 38.9(C) |
| 1 | 耐斯系统 | 61.4(B) | 38.9(C) | 60.2(B) | 69.3(B+) |
| 2 | SunCoke Energy | 60.6(B) | 68.9(B+) | 56.6(B-) | 55.1(B-) |
| 3 | 普尔斯玛特 | 31.5(C-) | 8.9(D) | 40.8(C) | 41.6(C) |
| 4 | 新城发展 | 53.3(B-) | 30.3(C-) | 43.6(C+) | 86.1(A) |
| 6 | 华能国际电力股份 | 51.1(B-) | 61.2(B) | 41.1(C) | 46.7(C+) |
| 7 | 安徽皖通高速公路 | 44.7(C+) | 51.2(B-) | 41.1(C) | 43.4(C+) |
| 8 | 民生银行 | 47.4(C+) | 37.0(C) | 71.4(B+) | 18.3(D+) |
| 9 | 中远海能 | 57.1(B-) | 61.1(B) | 38.7(C) | 78.8(A-) |
| 0 | 中国石油化工股份 | 58.8(B) | 69.9(B+) | 50.3(B-) | 57.9(B-) |
# Write the combined table to disk without the per-page row index.
ESG.to_csv('ESG.csv', index=False)