您的位置: 网站首页> SEO工具> 当前文章

python多线程百度PC指定域名查询关键词排名

老董-我爱我家房产SEO 2020-08-24 175围观, 102赞

  查询关键词排名有两种,第一种是指定url和关键词查看是否有排名,第二种是提供关键词查看某个域名下哪个url有排名,这里提供第二种方式的脚本。其中的注意事项如下:

  1、准备关键词文件kwd.txt(一行一个)

  2、指定待查询域名domain

  3、域名不要带https或者http

  4、如果一个词某个域名下有2个url出现排名 取第一个

  5、线程数默认是1,现在百度反爬比之前严重!线程最好是1。【多线程写同一个文件需要加锁否则可能数据错乱】

  1. # -*- coding: utf-8 -*-
  2. """
  3. 查首页自然排名(即含快照)前10名
  4. 关键词文件kwd.txt,一行一个,指定待查询domain
  5. domain不要带https或者http
  6. 结果保存:关键词 对应排名的url 及排名值
  7. 某域名下一个词有2个url排名,取第一个
  8.  
  9. """
  10.  
  11. import requests
  12. from pyquery import PyQuery as pq
  13. import threading
  14. import queue
  15. import time
  16. import gc
  17.  
  18.  
  19. class BdpcRank2(threading.Thread):
  20.  
  21. def __init__(self):
  22. threading.Thread.__init__(self)
  23.  
  24. # 读取txt文件 获取待查询url
  25. @staticmethod
  26. def read_txt(filepath):
  27. q = queue.Queue()
  28. for line in open(filepath, encoding='utf-8'):
  29. kwd_url = line.strip()
  30. q.put(kwd_url)
  31. return q
  32.  
  33. # 获取某词的serp源码
  34. def get_html(self,url,retry=2):
  35. try:
  36. r = requests.get(url=url,headers=my_header,timeout=5)
  37. except Exception as e:
  38. print('获取源码失败',e)
  39. time.sleep(6)
  40. if retry > 0:
  41. self.get_html(url,retry-1)
  42. else:
  43. html = r.content.decode('utf-8',errors='ignore') # 用r.text有时候识别错误
  44. url = r.url # 反爬会重定向,取定向后的地址
  45. return html,url
  46.  
  47. # 获取某词的serp源码上自然排名的所有url
  48. def get_encrpt_urls(self, html,url):
  49. encrypt_url_list_rank = []
  50. doc = pq(html)
  51. title = doc('title').text()
  52. if '_百度搜索' in title and 'https://www.baidu.com/s?tn=48020221' in url:
  53. div_list = doc('.result').items() # 自然排名/有快照
  54. # div_op_list = doc('.result-op').items() # 非自然排名
  55. for div in div_list:
  56. rank = div.attr('id')
  57. if rank:
  58. try:
  59. a_list = div('.t a').items()
  60. except Exception as e:
  61. print('未提取自然排名加密链接')
  62. else:
  63. for a in a_list:
  64. encrypt_url = a.attr('href')
  65. if encrypt_url.find('http://www.baidu.com/link?url=') == 0:
  66. encrypt_url_list_rank.append((encrypt_url,rank))
  67. else:
  68. print(title,'源码异常,可能反爬')
  69. time.sleep(100)
  70. return encrypt_url_list_rank
  71.  
  72. # 解密某条加密url
  73. def decrypt_url(self, encrypt_url, retry=1):
  74. real_url = 'xxxx'
  75. try:
  76. encrypt_url = encrypt_url.replace('http://', 'https://')
  77. r = requests.head(encrypt_url, headers=my_header)
  78. except Exception as e:
  79. print(encrypt_url, '解密失败', e)
  80. if retry > 0:
  81. self.decrypt_url(encrypt_url, retry - 1)
  82. else:
  83. real_url = r.headers['Location']
  84. return real_url
  85.  
  86. # 格式化成字典,键为url值为排名
  87. def make_dict(self,encrypt_urls_ranks):
  88. rank_dict = {}
  89. for encrypt_url_rank in encrypt_urls_ranks:
  90. encrypt_url,rank= encrypt_url_rank
  91. real_url = self.decrypt_url(encrypt_url)
  92. rank_dict[real_url] = rank
  93. return rank_dict
  94.  
  95. # 线程函数
  96. def run(self):
  97. while 1:
  98. kwd = q.get()
  99. url = "https://www.baidu.com/s?tn=48020221_28_hao_pg&ie=utf-8&wd={0}".format(kwd)
  100. try:
  101. html,now_url = self.get_html(url)
  102. encrypt_url_list_rank = self.get_encrpt_urls(html,now_url)
  103. except Exception as e:
  104. print(e)
  105. else:
  106. if encrypt_url_list_rank:
  107. rank_dict = self.make_dict(encrypt_url_list_rank)
  108. url_keys = list(rank_dict.keys())
  109. if domain not in ''.join(url_keys):
  110. f.write('{0} {1} {2}
  111. '.format(kwd,domain,'无'))
  112. print(kwd,'无')
  113. for real_url in url_keys:
  114. if domain in real_url:
  115. rank = rank_dict[real_url]
  116. f.write(kwd+' '+real_url+' '+str(rank)+'
  117. ')
  118. print(kwd,real_url,rank)
  119. f.flush()
  120. finally:
  121. del kwd
  122. gc.collect()
  123. q.task_done()
  124.  
  125.  
  126. if __name__ == "__main__":
  127. domain = 'www.renrenche.com'
  128. my_header = {
  129. 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
  130. 'Cookie':'BIDUPSID=EB1F44AB7896D7EFA4F0FD243C29FF17; PSTM=1567562976; BAIDUID=EB1F44AB7896D7EFA4F0FD243C29FF17:SL=0:NR=10:FG=1; BDUSS=BZWlZuSXpNWmNjM3BTSktnM2xhbGhIdUlqeW1ITEdvclpzSHpIS3p2WUMwc2hkRVFBQUFBJCQAAAAAAAAAAAEAAAAGtiZkNzcyNDgzMjAwZG9uZwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJFoV0CRaFdeF; plus_cv=1::m:49a3f4a6; MSA_WH=400_655; lsv=globalTjs_3a11c3d-globalT_androidcss_4630b37-wwwT_androidcss_c5f9a54-searchboxcss_591d86b-globalBcss_aad48cc-wwwBcss_777000e-framejs_c9ac861-atomentryjs_5cd4b30-globalBjs_99ad350-wwwjs_b674808; BD_UPN=19314353; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; BDICON=10294984.98; delPer=0; BD_CK_SAM=1; rsv_i=c2b6G%2F3avQC%2FfgLjK6Tg5dByzXJGjTHszykjx0XgYlZZgizi3%2F9wOVrzCucTWKLxPYYUs%2BqPpygizpeQMUWhVScLKRxzaaw; FEED_SIDS=732051_1030_14; plus_lsv=f197ee21ffd230fd; Hm_lvt_12423ecbc0e2ca965d84259063d35238=1572225355,1572415847,1572418912; Hm_lpvt_12423ecbc0e2ca965d84259063d35238=1572418912; BAIDULOC=12966109.384666294_4841881.341700486_100_131_1572418911981; SE_LAUNCH=5%3A26206981_0%3A26206981; BDPASSGATE=IlPT2AEptyoA_yiU4VKH3kIN8efjWvW4AfvESkplQFStfCaWmhH3BrUzWz0HSieXBDP6wZTXdMsDxXTqXlVXa_EqnBsZolpOaSaXzKGoucHtVM69-t5yILXoHUE2sA8PbRhL-3MEF2ZELlQvcgjchQZrchW8z3JTpxz1z5Xocc0T1UKR2VLJxJyTS7xvRHvcPNuz94rXnEpKKSmBUADHRVjYcSQyWXkD5NOtjsAm1Q0WrkoXGurSRvAa1G8vJpFeXAio1fWU60ul269v5HViViwh9UOI7u46MnJZ; H_WISE_SIDS=137151_137734_137755_136649_137663_137071_128070_134982_136665_120196_136768_137002_137788_136366_132909_136456_137690_135847_131246_137746_132378_136681_118893_118876_118846_118827_118802_132782_136800_136431_136093_133352_136862_137089_129652_136194_124637_137105_137572_133847_132551_137468_134046_129646_131423_137212_137466_136034_110085_127969_137613_131951_136611_137252_128196_137696_136636_137767_137207_134347_134231_137618_137449; kleck=638cabc3ad33a7a082343c4553a47c42; BDRCVFR[x4e6higC8W6]=mk3SLVN4HKm; PSINO=7; H_PS_PSSID=1440_21084_20697_29567_29220; sug=3; sugstore=0; ORIGIN=0; bdime=0; 
H_PS_645EC=db34IWhem1lYO7OwXVBPbsx2yQuIu3jmqGT9FUp09TItjsTj8omDTLnov6%2BIZQe6dqc',
  131. 'Host':'www.baidu.com',
  132. 'Upgrade-Insecure-Requests':'1'}
  133. q = BdpcRank2.read_txt('kwd.txt')
  134. f = open('bdpc_rank2.txt','w',encoding='utf-8')
  135. # 设置线程数
  136. for i in list(range(1)):
  137. t = BdpcRank2()
  138. t.setDaemon(True)
  139. t.start()
  140. q.join()
  141. f.flush()
  142. f.close()
  143.  
  144.  
鞍山二手宝骏 https://www.renrenche.com/as/baojun/ 5
鞍山二手保时捷 
鞍山宝马二手车报价 https://www.renrenche.com/as/baoma_baoma3xi/ 10
鞍山宝骏二手车报价 https://www.renrenche.com/as/baojun/ 4
鞍山二手宝马 
鞍山保时捷二手车报价 
鞍山二手宝沃 
鞍山二手北京 
鞍山北京二手车报价 
鞍山宝沃二手车报价 
鞍山巴博斯二手车报价 
鞍山二手北汽幻速 
鞍山北汽幻速二手车报价 
鞍山二手北汽绅宝 
鞍山北汽绅宝二手车报价 https://www.renrenche.com/gz/beiqishenbao/jishou/ 10


  以上就是百度PC端指定域名查询一批关键词下该域名排名的Url,大家可以copy使用,有问题一起交流,爬虫没有永远有效的,只要对方网站一调整,脚本就得跟着变

很赞哦!

python编程网提示:转载请注明来源www.python66.com。
有宝贵意见可添加站长微信(底部),获取技术资料请到公众号(底部)。同行交流请加群 python学习会

文章评论

    python多线程百度PC指定域名查询关键词排名文章写得不错,值得赞赏

站点信息

  • 网站程序:Laravel
  • 客服微信:a772483200