crawler-sample

code-test

这是一个用来测试代码显示效果的页面。

你也可以用它爬点东西。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
# from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time
import requests
import os

class DynCrawler:
    """Headless-Chrome crawler that saves the images of every page linked from a listing page.

    ``run()`` opens ``url``, collects the links found under elements with
    class ``list-t``, and for each link creates ``./<theme>/<title>/``
    containing ``url.txt`` (the link target) and the numbered images found
    on that page.

    The constructor starts a browser process. Call ``close()`` when done, or
    use the instance as a context manager, so the process is released.
    """

    # Characters that are replaced by '-' when a link title becomes a directory name.
    _TITLE_TRANSLATION = str.maketrans({'/': '-', '|': '-', ' ': '-', ':': '-'})
    # Seconds to wait for one image download before giving up on that image.
    _DOWNLOAD_TIMEOUT = 30

    def __init__(self, url, theme):
        """Start a headless Chrome session.

        Args:
            url: Address of the listing page to crawl.
            theme: Name of the top-level output directory.
        """
        self.url = url
        self.theme = theme
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')  # headless mode: no browser window
        self.driver = webdriver.Chrome(
            service=ChromeService(executable_path=ChromeDriverManager().install()),
            options=self.options,
        )
        # One {"url": ..., "title": ...} dict per link found by get_whole_page().
        self.href_list = []

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser on leaving the ``with`` block; exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Quit the browser session started by the constructor."""
        self.driver.quit()

    @staticmethod
    def _safe_dirname(title):
        """Return ``title`` with '/', '|', ' ' and ':' each replaced by '-'."""
        return title.translate(DynCrawler._TITLE_TRANSLATION)

    def get_whole_page(self):
        """Append every link on the currently loaded page to ``self.href_list``.

        Links are the ``<a>`` children of elements whose class is ``list-t``.
        """
        anchors = self.driver.find_elements(By.XPATH, '//*[@class="list-t"]/a')
        for anchor in anchors:
            url = anchor.get_attribute('href')
            if not url:
                # Nothing to crawl; storing str(None) would later be opened as the URL "None".
                continue
            self.href_list.append({"url": str(url), "title": str(anchor.text)})
            print("get page: ", url)

    def get_images(self, url, title):
        """Download the images of one page into ``./<theme>/<title>/``.

        Images are the ``<img>`` elements matched by
        ``//div[@class="content"]/a/img``; they are saved as ``1.jpg``,
        ``2.jpg``, ... in document order. An image that cannot be downloaded
        is reported and skipped; no file is created for it.

        Args:
            url: Page to open.
            title: Directory name (already sanitised) below the theme directory.
        """
        self.driver.get(url)
        # Fixed short pause after navigation, presumably to let the page render.
        time.sleep(0.5)
        images = self.driver.find_elements(By.XPATH, '//div[@class="content"]/a/img')
        target_dir = f'./{self.theme}/{title}'
        # Make the method usable on its own, not only after run() created the directory.
        os.makedirs(target_dir, exist_ok=True)
        for number, image in enumerate(images, start=1):
            image_url = image.get_attribute('src')
            print(image_url)
            if not image_url:
                continue
            # Download before opening the file so a failure leaves no empty .jpg behind.
            try:
                response = requests.get(image_url, timeout=self._DOWNLOAD_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as exc:
                print("skip image: ", image_url, exc)
                continue
            with open(f'{target_dir}/{number}.jpg', 'wb') as f:
                f.write(response.content)

    def get_title(self, url):
        """Open ``url`` and return the text of its ``<h1>`` heading.

        The heading is located with a fixed absolute XPath below ``#app``.
        """
        print("get title: ", url)
        self.driver.get(url)
        time.sleep(0.5)
        heading = self.driver.find_element(By.XPATH, '//*[@id="app"]/div/div[3]/div[1]/div[1]/h1')
        title = heading.text
        print("title: ", title)
        return title

    def run(self):
        """Crawl the listing page and download the images of every linked page."""
        os.makedirs(self.theme, exist_ok=True)
        self.driver.get(self.url)
        # Let element lookups wait up to 2 seconds for matches to appear.
        self.driver.implicitly_wait(2)
        self.get_whole_page()
        for data in self.href_list:
            url = data['url']
            title = self._safe_dirname(data['title'])
            page_dir = f'./{self.theme}/{title}'
            os.makedirs(page_dir, exist_ok=True)
            # Record where the images came from next to the images themselves.
            with open(f'{page_dir}/url.txt', 'w', encoding='utf-8') as f:
                f.write(url)
            self.get_images(url, title)

if __name__ == '__main__':
    # Ask for a singer name and crawl that singer's listing pages 1 through 7.
    print('请输入歌手名字:')
    singername = input().strip()
    if not singername:
        # An empty name would produce a bogus URL and an empty output directory name.
        raise SystemExit('歌手名字不能为空')
    for index in range(1, 8):
        url = f'http://www.echangwang.com/singer/{singername}_{index}.html'
        crawler = DynCrawler(url, singername)
        try:
            crawler.run()
        finally:
            # Each crawler starts its own Chrome process; quit it before starting the next.
            crawler.driver.quit()



本文总阅读量