1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
| from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.chrome.service import Service as ChromeService from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup import time import requests import os
class DynCrawler:
    """Headless-Chrome crawler that saves the images behind each link of a page.

    ``run()`` loads ``url``, collects every anchor matched by
    ``//*[@class="list-t"]/a`` and, for each one, creates
    ``./<theme>/<sanitized title>/`` containing ``url.txt`` (the link target)
    plus the images found on the linked page, numbered ``1.jpg``, ``2.jpg``, ...

    The instance owns a Chrome process; release it with ``close()`` or by
    using the crawler as a context manager (``with DynCrawler(...) as c:``).
    """

    # Characters that are replaced by '-' when a title becomes a directory name.
    _TITLE_TRANSLATION = str.maketrans({'/': '-', '|': '-', ' ': '-', ':': '-'})
    # Seconds to wait on a single image download before giving up on it.
    _DOWNLOAD_TIMEOUT = 10

    def __init__(self, url, theme):
        """Start a headless Chrome session.

        Args:
            url: Address of the page whose links are crawled by ``run()``.
            theme: Name of the top-level output directory.
        """
        self.url = url
        self.theme = theme
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')
        self.driver = webdriver.Chrome(
            service=ChromeService(executable_path=ChromeDriverManager().install()),
            options=self.options,
        )
        # Filled by get_whole_page() with {"url": ..., "title": ...} dicts.
        self.href_list = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Quit the browser session owned by this crawler."""
        self.driver.quit()

    @staticmethod
    def _safe_dirname(title):
        """Return *title* with '/', '|', ' ' and ':' each replaced by '-'."""
        return title.translate(DynCrawler._TITLE_TRANSLATION)

    def get_whole_page(self):
        """Append one ``{"url", "title"}`` dict per link on the current page.

        Reads the page currently loaded in the driver. Anchors without an
        ``href`` are skipped, because there is nothing to visit for them.
        """
        anchors = self.driver.find_elements(By.XPATH, '//*[@class="list-t"]/a')
        for anchor in anchors:
            url = anchor.get_attribute('href')
            if not url:
                continue
            data = {"url": str(url), "title": str(anchor.text)}
            self.href_list.append(data)
            print("get page: ", url)

    def get_images(self, url, title):
        """Download the images of the page at *url* into ``./<theme>/<title>/``.

        Files are named after the 1-based position of the ``<img>`` element on
        the page. An image whose download fails (network error, timeout or
        HTTP error status) is reported and skipped; no file is created for it.

        Args:
            url: Page to open in the browser.
            title: Already-sanitized directory name below the theme directory.
        """
        self.driver.get(url)
        # Short pause after navigation before querying the image elements.
        time.sleep(0.5)
        images = self.driver.find_elements(By.XPATH, '//div[@class="content"]/a/img')
        target_dir = f'./{self.theme}/{title}'
        os.makedirs(target_dir, exist_ok=True)
        for index, image in enumerate(images):
            image_url = image.get_attribute('src')
            print(image_url)
            if not image_url:
                continue
            # Fetch first and open the file only on success, so a failed
            # download never leaves an empty .jpg behind.
            try:
                response = requests.get(image_url, timeout=self._DOWNLOAD_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as exc:
                print("skip image: ", image_url, exc)
                continue
            image_path = f'{target_dir}/{index + 1}.jpg'
            with open(image_path, 'wb') as f:
                f.write(response.content)

    def get_title(self, url):
        """Open *url* and return the text of its ``<h1>`` heading.

        The heading is located by a fixed absolute XPath below ``#app``.
        """
        print("get title: ", url)
        self.driver.get(url)
        time.sleep(0.5)
        heading = self.driver.find_element(
            By.XPATH, '//*[@id="app"]/div/div[3]/div[1]/div[1]/h1'
        )
        title = heading.text
        print("title: ", title)
        return title

    def run(self):
        """Crawl ``self.url`` and store every linked page's images on disk.

        The browser stays open afterwards; call ``close()`` when finished.
        """
        os.makedirs(self.theme, exist_ok=True)
        self.driver.get(self.url)
        self.driver.implicitly_wait(2)
        self.get_whole_page()
        for data in self.href_list:
            url = data['url']
            title = self._safe_dirname(data['title'])
            page_dir = f'./{self.theme}/{title}'
            os.makedirs(page_dir, exist_ok=True)
            # Keep the source link next to the downloaded images.
            with open(f'{page_dir}/url.txt', 'w', encoding='utf-8') as f:
                f.write(url)
            self.get_images(url, title)
if __name__ == '__main__':
    # Ask (in Chinese) for the singer name; it is used both in the listing
    # URL and as the name of the output directory.
    print('请输入歌手名字:')
    singername = input().strip()
    if not singername:
        # An empty name would produce a bogus URL and an empty directory name.
        raise SystemExit('歌手名字不能为空')
    # Crawl listing pages 1 through 7 for this singer.
    for index in range(1, 8):
        url = f'http://www.echangwang.com/singer/{singername}_{index}.html'
        crawler = DynCrawler(url, singername)
        try:
            crawler.run()
        finally:
            # Each crawler starts its own Chrome process; always shut it down
            # so failed or finished pages do not leave browsers running.
            crawler.driver.quit()
|