Scraping Website Data with Python

Python data scraping: crawling the Sichuan procurement (bidding) website



Preface

  We often need to look up data from a particular website. When the volume is large or the query has to be repeated on a schedule, collecting it by hand is tedious, repetitive work. Python's mature scraping libraries can do it for us instead. The walkthrough below uses the Sichuan procurement website as an example.


I. Preparation

  The scraper relies on the bs4, re, and requests packages, and the results will be written to an Excel file, which requires the xlsxwriter package. Install these dependencies first; with internet access you can install them directly with pip install $packages, as in the example below.
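For example (note that on PyPI, bs4 is published under the name beautifulsoup4, and re ships with the standard library, so it needs no separate install):

pip install requests beautifulsoup4 XlsxWriter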

II. Usage Steps

1. Import the libraries

The code is as follows (example):

from xlsxwriter.workbook import Workbook
import requests
import re
from bs4 import BeautifulSoup

2. Fetch the data

The code is as follows (using the Sichuan procurement website as the example):

# Fetch the supplier list page (headers and cookie were copied from the
# browser's devtools; the JSESSIONID session cookie will expire and may
# need refreshing before the script is run)
url = 'http://202.61.88.152:9004/gpms/SupplierShowController.do?method=toSupplierList&rp=2000&page=1&districtLevel=1'
headers = {
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded',
    'cookie': 'JSESSIONID=RmT-YOIk2m8DLqMoV09NZp0kw19uscWWLABjqOHNG_bDJQej2hsT!544585480; SRV=test3D',
    'referer': 'http://202.61.88.152:9004/gpms/SupplierShowController.do?method=toSupplierList&rp=20&page=1&districtLevel=1',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
  }
res = requests.get(url, headers=headers, verify=False)
# Extract the org IDs from the inline "common.goToOrgShop('<id>', ...)" calls
numb = re.findall(r"common\.goToOrgShop\((.*?)\);return false;", res.text)
temp = []
for num in numb:
    if "CGAL" in num:  # skip the unwanted "CGAL" entries
        continue
    temp.append(num.split("'")[1])  # the first quoted argument is the orgInfoId
result = list(set(temp))  # deduplicate
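A side note on verify=False: it only affects HTTPS targets (the URLs here are plain http, so it is effectively a no-op), but when an https:// site is fetched this way, requests prints an InsecureRequestWarning for every call. urllib3 can silence it:

import urllib3

# Suppress the InsecureRequestWarning that verify=False triggers on HTTPS requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)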

# Fetch the fields we need from each detail record
def getDetail(orgInfoId):
    url = 'http://202.61.88.152:9004/gpms/OrgInfoController.do?method=toAuditOrgInfo_XYGH&orgInfoId=' + orgInfoId + '&auditRole=SUPPLIER&ishidden=true1'
    headers = {
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.9',
        'Content-Type': 'application/x-www-form-urlencoded',
        'cookie': 'JSESSIONID=RmT-YOIk2m8DLqMoV09NZp0kw19uscWWLABjqOHNG_bDJQej2hsT!544585480; SRV=test3D',
        'referer': 'http://202.61.88.152:9004/gpms/SupplierShowController.do?method=toSupplierList&rp=2000&page=1&districtLevel=1',
        'Connection': 'keep-alive',
        'Host': '202.61.88.152:9004',
        'Origin': 'http://202.61.88.152:9004',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
      }
    res = requests.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(res.text, features='html.parser')
    # Each field on the detail page is a <li> holding a <label> and a <span>
    li_list = soup.find_all('li')
    info = []
    try:
        companyName = mobile = address = ''
        for sp in li_list:
            label = sp.find('label')
            if not label:
                continue
            if label.text == "机构名称:":  # organisation name
                companyName = sp.find('span').text
            elif label.text == "联系手机:":  # contact mobile number
                mobile = sp.find('span').text
            elif label.text == "详细通信地址:":  # detailed mailing address
                address = sp.find('span').text
        info = [companyName, mobile, address]
        print(info)
    except Exception:
        # If the page cannot be parsed at all, record the detail URL so the
        # row can be checked by hand, and leave the other columns empty
        info = [url, "", ""]
    return info
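When looping over a couple of thousand IDs, it is also considerate to pause between requests rather than hit the server as fast as possible. A minimal sketch (the wrapper name and delay value are illustrative, not part of the original script):

import time

def getDetailPolitely(orgInfoId, delay=0.5):
    # Hypothetical wrapper: sleep briefly before delegating to getDetail
    time.sleep(delay)
    return getDetail(orgInfoId)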

# Write the query details into an Excel sheet
if __name__ == '__main__':
    workbook = Workbook('供应商信息.xlsx')
    worksheet = workbook.add_worksheet('result')
    # Header row: 序号 = No., 公司名称 = company name, 联系方式 = contact, 公司地址 = address
    worksheet.write(0, 0, "序号")
    worksheet.write(0, 1, "公司名称")
    worksheet.write(0, 2, "联系方式")
    worksheet.write(0, 3, "公司地址")
    for i, orgInfoId in enumerate(result):
        print(orgInfoId)
        idInfo = getDetail(orgInfoId)
        worksheet.write(i + 1, 0, i + 1)  # serial number, starting from 1
        for j, field in enumerate(idInfo):
            worksheet.write(i + 1, j + 1, field)

    workbook.close()
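As a small refinement, xlsxwriter can also style cells: workbook.add_format builds a format object that is passed as the last argument to write. For example, to make the header row bold (a drop-in change to the four header write() calls above):

    bold = workbook.add_format({'bold': True})
    worksheet.write(0, 0, "序号", bold)  # and likewise for the other three headers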

Summary

  The general approach to scraping a website's data:
  1. First, work out the target URL and how its pages are laid out;
  2. Second, match the key information you need using regular expressions or bs4;
  3. Third, write the scraped data out to a file (see the generic sketch below).
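A minimal, generic sketch of those three steps (the URL, the regex, and the output file name are placeholders, not the procurement site's):

import re
import requests

# Step 1: the page to scrape (placeholder address)
url = 'http://example.com/list'
html = requests.get(url, timeout=10).text

# Step 2: pull out the pieces we care about (here, every link target)
items = re.findall(r'<a href="(.*?)"', html)

# Step 3: persist the results to a file
with open('result.txt', 'w') as f:
    f.write('\n'.join(items))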