def Get_Url(target, start, ends):
    urls = []
    for i in range(start, ends):
        url = target + "/" + str(i)
        urls.append(url)
    return urls

if __name__ == "__main__":
    url = Get_Url("https://www.mzitu.com/214261", 1, 10)
    print(url)
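Get_Url only builds the list of page URLs; below is a minimal sketch of actually requesting each generated page (the User-Agent string and the assumption that every page answers a plain GET are mine):

import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'}

# Build the same paginated URLs and fetch each one in turn
urls = ["https://www.mzitu.com/214261" + "/" + str(i) for i in range(1, 10)]
for url in urls:
    ret = requests.get(url=url, headers=head, timeout=3)
    print("page: {}  status: {}".format(url, ret.status_code))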
Using the requests library:
import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

if __name__ == "__main__":
    ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=1)
    all_pic_link = re.findall('<img src="(.*?)"', ret.text, re.S)
    print(all_pic_link)
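The snippet above only prints the matched links; here is a minimal sketch that also downloads each matched image with requests (it assumes the matched src values are absolute URLs whose last path segment is a usable file name):

import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'}

ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=3)
for link in re.findall('<img src="(.*?)"', ret.text, re.S):
    filename = link.split("/")[-1]      # use the last path segment as the local name
    img = requests.get(url=link, headers=head, timeout=3)
    with open(filename, "wb") as fp:
        fp.write(img.content)           # write the raw image bytes to disk
    print("saved:", filename)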
A simple image crawler:
import re
import urllib.request

def open_url(url):
    ret = urllib.request.Request(url)
    ret.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36')
    page = urllib.request.urlopen(ret)
    html = page.read().decode("utf-8")
    return html

def get_img(html):
    ret = re.findall(r'<img src="([^"]+\.jpg)"', html)
    for each in ret:
        filename = each.split("/")[-1]
        print("full path:", each)
        print("file name:", filename)
        urllib.request.urlretrieve(each, filename, None)

if __name__ == '__main__':
    url = open_url("https://www.mzitu.com/210402")
    get_img(url)
Crawling the daily CVE vulnerability list:
import re
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def Get_CVE(url):
    new_cve = []
    ret = requests.get(url=url, headers=head, timeout=3)
    bs = BeautifulSoup(ret.text, 'html.parser')
    for i in bs.find_all('a'):
        href = i.get('href')
        new_cve.append(href)
    return new_cve

def Get_Number(cve_list):
    new = []
    for i in cve_list:
        temp = re.findall("[0-9]{1,}-.*", str(i))
        if temp:                                  # skip links that carry no CVE number
            new.append("CVE-{}".format(temp[0]))
    return new

if __name__ == "__main__":
    url = "https://cassandra.cerias.purdue.edu/CVE_changes/today.html"
    cve = Get_CVE(url)
    number = Get_Number(cve)
    for i in number:
        print("today's CVE:", i)
Simple crawl of Xici proxy addresses: here we just use plain regex matching, which is a rather clumsy approach.
import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.xicidaili.com/nn/1", headers=head, timeout=3)
data = re.findall('<td>.*</td>', ret.text)

index = 0
for i in range(0, 20):
    IP = data[index].replace("<td>", "").replace("</td>", "")
    Port = data[index + 1].replace("<td>", "").replace("</td>", "")
    Type = data[index + 2].replace("<td>", "").replace("</td>", "")
    times = data[index + 3].replace("<td>", "").replace("</td>", "")
    year = data[index + 4].replace("<td>", "").replace("</td>", "")
    print("IP: {}  Port: {}  Type: {}  Lifetime: {}  Time: {}".format(IP, Port, Type, times, year))
    index = index + 5
Crawling Centanet second-hand housing listings with bs4:

from bs4 import BeautifulSoup
import requests
import html5lib

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://gz.centanet.com/ershoufang/", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))

bs = BeautifulSoup(text, "html5lib")
ret = bs.select('div[class="section"] div[class="house-item clearfix"] p[class="house-name"]')
for i in ret:
    # house = i.get_text()              # extract all strings together with their formatting
    house = list(i.stripped_strings)    # extract the strings and return them as a list
    print(house)
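To grab more than the house-name paragraph, each whole house-item block can be walked and all of its visible text fragments collected, which avoids guessing further class names; a minimal sketch under the same page-structure assumption as above:

from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'}
ret = requests.get(url="https://gz.centanet.com/ershoufang/", headers=head, timeout=3)

bs = BeautifulSoup(ret.content.decode('utf-8'), "html5lib")
# Walk every listing block and keep every non-empty text fragment inside it
for item in bs.select('div[class="section"] div[class="house-item clearfix"]'):
    print(list(item.stripped_strings))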
Crawling the China Weather Network (weather.com.cn):
from bs4 import BeautifulSoup
import requests
import html5lib

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="http://www.weather.com.cn/textFC/shandong.shtml", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))

bs = BeautifulSoup(text, "html5lib")
conMidtab = bs.find_all('div', class_='conMidtab')[0]   # locate the first forecast table
tr = conMidtab.find_all('tr')[2:]                       # search inside conMidtab; keep tr tags from the 3rd onwards
for i in tr:
    td = i.find_all('td')                 # collect every td tag in the row
    city_td = td[0]                       # the first td holds the city name
    # stripped_strings yields all non-tag descendant strings, with empty ones removed
    city = list(city_td.stripped_strings)[0]
    temp = td[-5]                         # the td holding the temperature
    temperature = list(temp.stripped_strings)[0]
    print('City: {}  Temperature: {}'.format(city, temperature))
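The same parsing logic can be reused for other regions by swapping the page name in the textFC URL; a minimal sketch looping over a few pages (the page-name list is only an example):

from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'}

pages = ["shandong", "hebei", "henan"]       # example region pages, extend as needed
for page in pages:
    url = "http://www.weather.com.cn/textFC/{}.shtml".format(page)
    ret = requests.get(url=url, headers=head, timeout=3)
    bs = BeautifulSoup(ret.content.decode('utf-8'), "html5lib")
    conMidtab = bs.find_all('div', class_='conMidtab')[0]   # first forecast table
    for tr in conMidtab.find_all('tr')[2:]:
        td = tr.find_all('td')
        city = list(td[0].stripped_strings)[0]
        temperature = list(td[-5].stripped_strings)[0]
        print('{} {}: {}'.format(page, city, temperature))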
Crawling Xici proxies with bs4: using the library, the whole job is done in a few lines.
import re
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.xicidaili.com/wt/", headers=head, timeout=3)
bs = BeautifulSoup(ret.text, "lxml")
ret = bs.select('table[id="ip_list"] tr[class="odd"]')

ip = []
for i in ret:
    house = list(i.stripped_strings)
    ip.append(house)

fp = open("save.log", 'a+', encoding='utf-8')
for i in range(0, 50):
    address = "http://{}:{}".format(ip[i][0], ip[i][1])
    print(address, file=fp)
    print("proxy address (saved) {}".format(address))
fp.close()
Using a proxy IP address with requests
import re
from time import sleep
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = {
    "http": "http://127.0.0.1:9999"
}
# Without a password: "http": "http://ip:port"
# With a password:    "https": "https://username:password@ip:port"

file = open("save.log", "r", encoding="utf-8")
for i in file.readlines():
    data = "".join(i.split('\n'))        # strip the trailing newline
    proxy.update(http=data)              # point proxy["http"] at the address on this line
    try:
        ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head, timeout=3, proxies=proxy)
        if ret.status_code == 200:
            print("proxy: {} request completed".format(proxy["http"]))
        else:
            print("proxy: {} offline, request failed".format(proxy["http"]))
    except requests.exceptions.RequestException:
        print("proxy: {} offline, request failed".format(proxy["http"]))
    sleep(1)
file.close()
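The same loop can keep only the proxies that actually answered; a minimal sketch that filters save.log into a separate output file (good.log is a hypothetical name):

from time import sleep
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'}

good = []
with open("save.log", "r", encoding="utf-8") as fp:
    for line in fp.readlines():
        address = line.strip()
        if not address:
            continue
        try:
            ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head,
                               timeout=3, proxies={"http": address})
            if ret.status_code == 200:
                good.append(address)
        except requests.exceptions.RequestException:
            pass                          # dead proxy, skip it
        sleep(1)

# Write only the proxies that responded into the hypothetical good.log
with open("good.log", "w", encoding="utf-8") as fp:
    fp.write("\n".join(good))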
Downloading a file through a proxy with requests
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = {
    "http": "http://117.69.200.46:9999"
}

url = "https://nmap.org/dist/nmap-7.80-win32.zip"

ret = requests.get(url=url, headers=head, stream=True, proxies=proxy)
fp = open("nmap.zip", "wb")

for chunk in ret.iter_content(chunk_size=4096):
    if chunk:
        print("chunk saved, length: {}".format(len(chunk)))
        fp.write(chunk)
fp.close()
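When the server sends a Content-Length header, the download can report its progress as a percentage; a minimal sketch using the same URL and proxy as above (the percentage is simply skipped if the header is missing):

import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'}
proxy = {"http": "http://117.69.200.46:9999"}
url = "https://nmap.org/dist/nmap-7.80-win32.zip"

ret = requests.get(url=url, headers=head, stream=True, proxies=proxy)
total = int(ret.headers.get("Content-Length", 0))    # 0 when the header is absent
done = 0
with open("nmap.zip", "wb") as fp:
    for chunk in ret.iter_content(chunk_size=4096):
        if chunk:
            fp.write(chunk)
            done += len(chunk)
            if total:
                print("progress: {:.1f}%".format(done * 100 / total))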
Simple subdomain crawling
import requests
import json

def GetSubDomain(domain):
    url = "http://ce.baidu.com/index/getRelatedSites?site_address={}".format(domain)
    ret = requests.get(url=url)
    obj = json.loads(ret.text)
    data = obj.get("data")
    print("number of subdomains: {}".format(len(data)))

    fp = open("domain.log", "w")
    for item in data:
        fp.write(item.get("domain"))
        fp.write("\n")
        print(item)
    fp.close()

GetSubDomain("qq.com")
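The data field can come back empty for some domains, so it is worth guarding against that before indexing it; a minimal sketch that does so and queries several targets in a row (the domain list is only an example):

import json
import requests

def get_sub_domains(domain):
    # Same ce.baidu.com endpoint as above; returns a list of subdomain strings
    url = "http://ce.baidu.com/index/getRelatedSites?site_address={}".format(domain)
    ret = requests.get(url=url, timeout=5)
    data = json.loads(ret.text).get("data") or []     # fall back to an empty list
    return [item.get("domain") for item in data]

for target in ["qq.com", "baidu.com"]:
    subs = get_sub_domains(target)
    print("{} -> {} subdomains".format(target, len(subs)))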
A cnblogs auto-backup tool: automatically backs up cnblogs posts, including images, for quick archiving.
from bs4 import BeautifulSoup
import requests, os

header = {"User-Agent": "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}

def get_url(name, start_page, end_page):
    title = []
    value = []
    for x in range(start_page, end_page + 1):
        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name, x)
        response = requests.get(url, headers=header, timeout=5)
        text = str(response.content.decode("utf-8"))
        bs = BeautifulSoup(text, "lxml")
        ret = bs.select('div[class="day"] div[class="postTitle"] a')
        for item in ret:
            t = item.get_text().replace("\n", "")
            href = item.get('href').replace("\n", "")
            title.append(t)
            value.append(href)
            print("[+] article ---> link: {} ---> title: {}".format(href, t))
    return title, value
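With the titles and links collected by get_url above, each article page can be fetched and written to a local HTML file, which is the backup step itself; a rough sketch reusing get_url and header from the code above (the output directory name and the file-name sanitising are my own additions):

import os
import requests

def backup(name, start_page, end_page, out_dir="cnblogs_backup"):
    # get_url and header are the ones defined above
    title, value = get_url(name, start_page, end_page)
    os.makedirs(out_dir, exist_ok=True)
    for t, link in zip(title, value):
        page = requests.get(link, headers=header, timeout=5)
        safe = "".join(c for c in t if c not in '\\/:*?"<>|').strip()   # keep only file-name-safe characters
        path = os.path.join(out_dir, safe + ".html")
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(page.content.decode("utf-8"))
        print("[+] saved: {}".format(path))

# Example call (blog name is a placeholder):
# backup("lyshark", 1, 2)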
A Selenium-based variant of the backup tool collects the post links the same way and then drives a Chrome session:

from selenium import webdriver
from bs4 import BeautifulSoup
import requests, os, time, lxml
import win32api, win32con

header = {"User-Agent": "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}

def get_url(name, start_page, end_page):
    value = []
    for x in range(start_page, end_page + 1):
        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name, x)
        response = requests.get(url, headers=header, timeout=5)
        text = str(response.content.decode("utf-8"))
        bs = BeautifulSoup(text, "lxml")
        ret = bs.select('div[class="day"] div[class="postTitle"] a')
        for item in ret:
            href = item.get('href').replace("\n", "")
            value.append(href)
            print("[+] crawled link: {} ".format(href))
    return value

if __name__ == "__main__":
    value = get_url("csnd", 1, 2)
    WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
    driver = webdriver.Chrome(executable_path=WebPath)
    driver.set_window_size(1024, 768)