import requests
import re
from bs4 import BeautifulSoup
import os
def mingzi(name):
    """Zero-pad the two catalogue numbers at the start of *name*.

    Index entries are labelled like ``1-1 title``; this normalises the
    label to two digits per part, e.g. ``1-1`` -> ``01-01``.  All spaces
    are stripped from the result (matching the original behaviour).

    Returns the normalised name; if no ``<num>-<num>`` prefix is found,
    returns the name with spaces stripped as a best effort.
    """
    # Regex instead of fixed-offset slicing: the old code did
    # int(name[num2+1:num2+3]) and raised ValueError whenever a
    # single-digit second number was followed directly by a non-space
    # character (e.g. "1-1总表").
    m = re.match(r'(\d+)-(\d+)(.*)', name)
    if m is None:
        return name.replace(' ', '')
    first, second, rest = m.groups()
    # zfill(2) is a no-op on numbers that already have >= 2 digits.
    name = first.zfill(2) + '-' + second.zfill(2) + rest
    return name.replace(' ', '')
def downfiles(url, name, path=r"F:/2023"):
    """Download *url* and save it under *path* as ``<name><ext>``.

    The file extension is taken from the URL itself.  (The old code
    reached into the caller's loop variable ``link['href']``, coupling
    this function to the calling scope; deriving it from *url* gives the
    same extension without the hidden global.)

    Parameters:
        url:  full URL of the file to download
        name: normalised base name (output of ``mingzi``)
        path: target directory, created if missing; default kept for
              backward compatibility with the original hard-coded value

    NOTE(review): relies on the module-level ``headers`` dict — confirm
    it is defined before this is called.
    """
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race.
    os.makedirs(path, exist_ok=True)
    ext = os.path.splitext(url)[1]  # includes the leading '.'
    full_name = path + '/' + name + ext
    response = requests.get(url, headers=headers)
    with open(full_name, 'wb') as f:
        f.write(response.content)
# Scrape the 2023 Shaanxi statistical yearbook index and download each table.
url = "http://tjj.shaanxi.gov.cn/upload/2023/zk/lefte.htm"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
response.encoding = "utf-8"  # force UTF-8; requests may mis-detect the charset
html = response.text
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    name = link.string
    # Skip anchors with no text, and entries not numbered like "1-2 ...".
    # Raw string for the pattern: "\d" in a plain literal is an invalid
    # escape sequence (SyntaxWarning since Python 3.12).
    if name is None or not re.match(r"\d{1,2}-", name):
        continue
    href = link.get('href')
    if not href:  # anchor without an href would raise KeyError on link['href']
        continue
    file_url = 'http://tjj.shaanxi.gov.cn/upload/2023/zk/' + href
    name = mingzi(name)  # zero-pad numbering, e.g. 1-1 -> 01-01
    downfiles(file_url, name)
# 陕西省统计局网站的统计年鉴,提供了 xls 文件:
# http://tjj.shaanxi.gov.cn/tjsj/ndsj/tjnj/sxtjnj/index.html?2023
# 昨天晚上网上查询爬虫的方法,又求助了论坛各位大佬,收获颇多:
# 1. 爬取的网页地址要看 frame 框架里 src="left.htm" 的地址(document 中的),而不是直接的链接。
# 2. 了解了 href 的处理。