import requests
import re
from bs4 import BeautifulSoup
import os
def mingzi(name):
    """Zero-pad the two catalogue numbers at the start of *name*.

    Index entries are labelled like ``1-1 title``; this normalises the
    label to two digits per part, e.g. ``1-1`` -> ``01-01``.  All spaces
    are stripped from the result (matching the original behaviour).

    Returns the normalised name; if no ``<num>-<num>`` prefix is found,
    returns the name with spaces stripped as a best effort.
    """
    # Regex instead of fixed-offset slicing: the old code did
    # int(name[num2+1:num2+3]) and raised ValueError whenever a
    # single-digit second number was followed directly by a non-space
    # character (e.g. "1-1总表").
    m = re.match(r'(\d+)-(\d+)(.*)', name)
    if m is None:
        return name.replace(' ', '')
    first, second, rest = m.groups()
    # zfill(2) is a no-op on numbers that already have >= 2 digits.
    name = first.zfill(2) + '-' + second.zfill(2) + rest
    return name.replace(' ', '')
def downfiles(url, name, path=r"F:/2023"):
    """Download *url* and save it under *path* as ``<name><ext>``.

    The file extension is taken from the URL itself.  (The old code
    reached into the caller's loop variable ``link['href']``, coupling
    this function to the calling scope; deriving it from *url* gives the
    same extension without the hidden global.)

    Parameters:
        url:  full URL of the file to download
        name: normalised base name (output of ``mingzi``)
        path: target directory, created if missing; default kept for
              backward compatibility with the original hard-coded value

    NOTE(review): relies on the module-level ``headers`` dict — confirm
    it is defined before this is called.
    """
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race.
    os.makedirs(path, exist_ok=True)
    ext = os.path.splitext(url)[1]  # includes the leading '.'
    full_name = path + '/' + name + ext
    response = requests.get(url, headers=headers)
    with open(full_name, 'wb') as f:
        f.write(response.content)
# Scrape the 2023 Shaanxi statistical yearbook index and download each table.
url = "http://tjj.shaanxi.gov.cn/upload/2023/zk/lefte.htm"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
response.encoding = "utf-8"  # force UTF-8; requests may mis-detect the charset
html = response.text
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    name = link.string
    # Skip anchors with no text, and entries not numbered like "1-2 ...".
    # Raw string for the pattern: "\d" in a plain literal is an invalid
    # escape sequence (SyntaxWarning since Python 3.12).
    if name is None or not re.match(r"\d{1,2}-", name):
        continue
    href = link.get('href')
    if not href:  # anchor without an href would raise KeyError on link['href']
        continue
    file_url = 'http://tjj.shaanxi.gov.cn/upload/2023/zk/' + href
    name = mingzi(name)  # zero-pad numbering, e.g. 1-1 -> 01-01
    downfiles(file_url, name)
# 陕西省统计局网站的统计年鉴,提供了 xls 文件:
# http://tjj.shaanxi.gov.cn/tjsj/ndsj/tjnj/sxtjnj/index.html?2023
# 昨天晚上网上查询爬虫的方法,又求助了论坛各位大佬,收获颇多:
# 1. 爬取的网页地址要看 frame 框架里 src="left.htm" 的地址(document 中的),而不是直接的链接。
# 2. 了解了 href 的处理。