来源:python中国网 时间:2019-07-18

  需求:在百度搜索www.python66.com,然后将搜索结果保存到文件bd_python66.html(注意:下面的示例代码只提取并打印了结果页的title,并未写入文件)

  百度搜索的url:https://www.baidu.com/s?wd=搜索词

  params参数进行url传参,代码如下:

# -*- coding: utf-8 -*-
import requests
import re

def get_html(url, key_value, retry=2):
    """Fetch ``url`` with query parameters and return the page text.

    Sends a GET request via ``requests.get`` using the module-level
    ``headers`` dict, ``key_value`` as the ``params`` argument and a
    5-second timeout.

    Args:
        url: Address to request.
        key_value: Value handed to ``requests.get`` as ``params``.
        retry: Number of additional attempts made after a failed request.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the first
        attempt and every retry raised an exception.
    """
    try:
        r = requests.get(url=url, headers=headers, params=key_value, timeout=5)
    except Exception as e:
        # Best-effort fetch: report the failure, then retry if allowed.
        print(e)
        if retry > 0:
            # Forward key_value explicitly (otherwise retry-1 would land in
            # the key_value slot) and return the retry's result to the caller.
            return get_html(url, key_value, retry - 1)
        return None
    else:
        r.encoding = 'utf-8'
        return r.text


if __name__ == "__main__":

    # Custom request headers; get_html reads this module-level name.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }
    # Note: the query string is not written here; it is supplied through
    # the params argument (kw) inside get_html.
    url = 'https://www.baidu.com/s?'
    kw = {'wd': 'www.python66.com'}
    html = get_html(url, kw)
    if html is None:
        # get_html returns None when the request failed, so there is
        # nothing to search.
        print('request failed, no page content to parse')
    else:
        # Extract the page title; re.search returns None when no <title>
        # element is present.
        title = re.search('<title>(.*?)</title>', html)
        if title is None:
            print('no <title> found in the response')
        else:
            print(title.group(1))


D:\python3\install\python.exe D:/python/py3script/test.py
www.python66.com_百度搜索

Process finished with exit code 0


  url重定向演示,GitHub 将所有的 HTTP 请求重定向到 HTTPS。案例代码如下:

# -*- coding: utf-8 -*-
import requests

def get_html(url, retry=2):
    """Request ``url`` and print its redirect history, final URL and status.

    The request goes through ``requests.get`` with the module-level
    ``headers`` dict and a 5-second timeout. When the request raises, the
    exception is printed and the request is repeated while retries remain;
    once they are used up the function returns without printing a result.

    Args:
        url: Address to request.
        retry: Number of additional attempts made after a failed request.
    """
    remaining = retry
    while True:
        try:
            response = requests.get(url=url, headers=headers, timeout=5)
        except Exception as err:
            print(err)
            if remaining > 0:
                remaining -= 1
                continue
            # Out of retries: give up silently, as there is no response.
            return
        break

    print('重定向', response.history)
    print('重定向后的请求url', response.url)
    print(response.status_code)


if __name__ == "__main__":

    # Custom request headers; get_html reads this module-level name.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/72.0.3626.121 Safari/537.36'
        ),
    }
    # Plain-HTTP address, used to demonstrate the redirect handling.
    url = 'http://github.com/'
    get_html(url)


D:\python3\install\python.exe D:/python/py3script/test.py
重定向 [<Response [301]>]
重定向后的请求url https://github.com/
200

Process finished with exit code 0


  通过 allow_redirects 参数禁用重定向处理:

# -*- coding: utf-8 -*-
import requests

def get_html(url, retry=2):
    """Request ``url`` with redirects disabled and print what came back.

    The request goes through ``requests.get`` with the module-level
    ``headers`` dict, ``allow_redirects=False`` and a 5-second timeout.
    When the request raises, the exception is printed and the request is
    repeated while retries remain; once they are used up the function
    returns without printing a result.

    Args:
        url: Address to request.
        retry: Number of additional attempts made after a failed request.
    """
    attempts_left = retry
    while True:
        try:
            reply = requests.get(
                url=url,
                headers=headers,
                allow_redirects=False,
                timeout=5,
            )
        except Exception as exc:
            print(exc)
            if attempts_left > 0:
                attempts_left -= 1
                continue
            # Out of retries: nothing to report.
            return
        break

    print('重定向', reply.history)
    print('请求url', reply.url)
    print(reply.status_code)


if __name__ == "__main__":

    # Custom request headers; get_html reads this module-level name.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/72.0.3626.121 Safari/537.36'
        ),
    }
    # Plain-HTTP address; get_html requests it with redirects disabled.
    url = 'http://github.com/'
    get_html(url)


D:\python3\install\python.exe D:/python/py3script/test.py
重定向 []
请求url http://github.com/
301

Process finished with exit code 0


  cookie自动登录:如果不登录人人网,是无法访问个人主页的。我们先登录人人网,通过浏览器抓包找到cookie,然后把cookie加到自己构造的请求头里面,再访问个人主页的url,就可以获取到正常信息(推荐阅读:cookie是什么),代码如下:

# -*- coding: utf-8 -*-
import requests
import re

def get_html(url, retry=2):
    """Fetch ``url`` and return the response text.

    Sends a GET request via ``requests.get`` using the module-level
    ``headers`` dict and a 5-second timeout.

    Args:
        url: Address to request.
        retry: Number of additional attempts made after a failed request.

    Returns:
        The response body as text, or ``None`` when the first attempt and
        every retry raised an exception.
    """
    try:
        r = requests.get(url=url, headers=headers, timeout=5)
    except Exception as e:
        # Best-effort fetch: report the failure, then retry if allowed.
        print(e)
        if retry > 0:
            # Return the retry's result; without this a successful retry
            # would still hand None back to the caller.
            return get_html(url, retry - 1)
        return None
    else:
        return r.text


if __name__ == "__main__":

    # Custom request headers; get_html reads this module-level name.
    # NOTE(review): the Cookie value is a session cookie captured from a
    # logged-in browser; replace it with your own and avoid committing
    # live credentials.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Cookie': 'anonymid=jy80yf87nu48vb; depovince=BJ; jebecookies=b398e24f-6670-48af-a58e-e6fd6456bcd6|||||; _r01_=1; JSESSIONID=abcrmwfsBwR_ufXciZcWw; ick_login=8a9f224f-1671-41ed-ab25-74e0a42ac995; _de=96965DC06F71F402340E4CEC836F3769696BF75400CE19CC; p=cae405d7c6e785f089ca39606c9d88695; first_login_flag=1; ln_uact=614863843@qq.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn521/20101208/1350/h_main_sadA_14a1000031012f76.jpg; t=940ac4b9ace0423b80a81dee055637955; societyguester=940ac4b9ace0423b80a81dee055637955; id=347908095; xnsid=fd3328ae; ver=7.0; loginfrom=null; jebe_key=8cc12fc2-9a64-4553-85ea-671b395d345b%7Cef398f6216b3a86b3d29665bee53e231%7C1563415076620%7C1%7C1563415078837; jebe_key=8cc12fc2-9a64-4553-85ea-671b395d345b%7Cef398f6216b3a86b3d29665bee53e231%7C1563415076620%7C1%7C1563415078840; wp_fold=0',
    }
    url = 'http://www.renren.com/347908095/profile'
    html = get_html(url)
    if html is None:
        # get_html returns None when the request failed, so there is
        # nothing to search.
        print('request failed, no page content to parse')
    else:
        # re.search returns None when the page has no <title> element.
        title = re.search('<title>(.*?)</title>', html)
        if title is None:
            print('no <title> found in the response')
        else:
            print(title.group(1))


D:\python3\install\python.exe D:/python/py3script/test.py
人人网 - 老董

Process finished with exit code 0