在SEO人员那里,判断一个行业哪些站做的好,有一个大众比较认可的思路。找一批行业词,查询每个词百度排名前10的url,然后提取下来,最后统计下哪个域名出现次数多。出现次数多的几个域名就是这个行业的优质站点。
具体到搜索引擎搜索展现的实际情况,其实有2种统计方式:
1)查询一批词,可以统计某域名出现次数,然后最终做计算。(一个关键词同一个域名出现N个url排名计算N次)
2)查询一批词,也可以只计算这批词排在首页词数来筛选域名。(一个关键词同一个域名出现N个url排名计算1次,相当于计算首页词数)
本文按照第1种方式来计算,准备kwd.txt,一行一个关键词。
选取少部分词测试下,实际工作为了科学准确需要选大量行业词,代码如下。
# -*- coding: utf-8 -*-
"""一个关键词serp上同一个域名出现N个url排名 计算N次

Worker threads pull keywords from a shared queue, scrape the Baidu PC SERP,
resolve the encrypted result links, and tally how many ranked URLs each
domain owns across all keywords.
"""
import requests
from pyquery import PyQuery as pq
import threading
import queue
import time
from urllib.parse import urlparse
import gc


class bdpcCover(threading.Thread):
    """One SERP-scraping worker thread.

    Relies on module-level globals set in the ``__main__`` block:
    ``user_agent``, ``threadLock``, ``result``, ``success_num``, ``q``.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    # 读取txt文件 关键词进入队列
    @staticmethod
    def read_file(filepath):
        """Read one keyword per line from *filepath* into a FIFO queue."""
        kw_queue = queue.Queue()
        # with-statement closes the file handle (original leaked it)
        with open(filepath, encoding='utf-8') as f:
            for kwd in f:
                kwd = kwd.strip()
                if kwd:  # skip blank lines
                    kw_queue.put(kwd)
        return kw_queue

    # 获取某词serp源码
    def get_html(self, url, retry=2):
        """Fetch the SERP HTML for *url*, retrying up to *retry* times.

        Returns the decoded HTML string, or None when all attempts fail.
        """
        try:
            r = requests.get(url=url, headers=user_agent, timeout=5)
        except Exception as e:
            print('获取源码失败', e)
            # BUG FIX: the retry result was previously dropped (no return),
            # so callers always got None after a transient failure; and `r`
            # was unbound on the final failure path.
            if retry > 0:
                return self.get_html(url, retry - 1)
            return None
        else:
            # 直接 r.text有时识别错误 — decode manually to avoid requests
            # guessing the wrong charset.
            return r.content.decode('utf-8')

    # 获取某词serp源码上自然排名的所有url
    def get_encrpt_urls(self, html, url=''):
        """Extract the encrypted (baidu.com/link?url=) organic-result URLs.

        *url* is only used for logging; the extra parameter has a default,
        so existing callers remain compatible.
        """
        encrypt_url_list = []
        if html and '_百度搜索' in html:
            doc = pq(html)
            try:
                a_list = doc('.t a').items()
            except Exception as e:
                # BUG FIX: `url` was an undefined name in this scope before,
                # raising NameError and masking the real extraction error.
                print('未提取到serp上的解密url', e, url)
            else:
                for a in a_list:
                    encrypt_url = a.attr('href')
                    # guard against <a> without href (attr() returns None)
                    if encrypt_url and encrypt_url.find('http://www.baidu.com/link?url=') == 0:
                        encrypt_url_list.append(encrypt_url)
        return encrypt_url_list

    # 解密某条加密url
    def decrypt_url(self, encrypt_url, retry=1):
        """Resolve one encrypted redirect URL to its real target via HEAD.

        Returns the real URL, or None when resolution keeps failing or the
        response carries no Location header.
        """
        try:
            encrypt_url = encrypt_url.replace('http://', 'https://')
            # requests.head does not follow redirects by default, so the
            # 302 Location header holds the decoded destination.
            r = requests.head(encrypt_url, headers=user_agent)
        except Exception as e:
            print(encrypt_url, '解密失败', e)
            # BUG FIX: retry value was previously discarded (missing return).
            if retry > 0:
                return self.decrypt_url(encrypt_url, retry - 1)
            return None
        else:
            # BUG FIX: .get avoids KeyError when Location is absent
            return r.headers.get('Location')

    # 获取某词serp源码首页排名真实url
    def get_real_urls(self, encrypt_url_list):
        """Decrypt every redirect URL; drop the ones that failed (None)."""
        decrypted = (self.decrypt_url(u) for u in encrypt_url_list)
        return [real_url for real_url in decrypted if real_url]

    # 提取某条url域名部分
    def get_domain(self, real_url):
        """Return the netloc (domain) of *real_url*; 'xxx' on parse failure."""
        try:
            res = urlparse(real_url)
        except Exception as e:
            print(e, real_url)
            return "xxx"
        return res.netloc

    # 获取某词serp源码首页排名真实url的域名部分
    def get_domains(self, real_url_list):
        """Map each real URL to its domain part."""
        return [self.get_domain(real_url) for real_url in real_url_list]

    # 线程函数
    def run(self):
        global success_num
        while 1:
            kwd = q.get()
            try:
                url = "https://www.baidu.com/s?ie=utf-8&wd={0}".format(kwd)
                html = self.get_html(url)
                encrypt_url_list = self.get_encrpt_urls(html, url)
                real_url_list = self.get_real_urls(encrypt_url_list)
                domain_list = self.get_domains(real_url_list)
                if domain_list:
                    # with-statement guarantees the lock is released even if
                    # an exception fires inside the critical section
                    with threadLock:
                        for domain in domain_list:
                            result[domain] = result[domain] + 1 if domain in result else 1
                        success_num += 1
                        print('查询成功{0}个'.format(success_num))
                del kwd
                gc.collect()
            except Exception as e:
                print(e)
            finally:
                q.task_done()

    # 保存数据
    @staticmethod
    def save():
        """Sort domains by hit count (desc) and write them to bdpc_result1.txt."""
        print('开始save.....')
        res_sort = sorted(result.items(), key=lambda s: s[1], reverse=True)
        print(res_sort)
        with open('bdpc_result1.txt', 'w', encoding="utf-8") as f:
            for domain, value in res_sort:
                # BUG FIX: one record per line (was space-joined into one
                # unparsable line)
                f.write(str(domain) + ' ' + str(value) + '\n')


if __name__ == "__main__":
    start = time.time()
    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'}
    threadLock = threading.Lock()  # 锁 — guards `result` and `success_num`
    result = {}  # 初始结果保存字典 domain -> hit count
    success_num = 0  # 查询成功个数
    q = bdpcCover.read_file('kwd.txt')
    all_num = q.qsize()  # 总词数
    # 设置线程数
    for i in range(5):
        t = bdpcCover()
        t.daemon = True  # setDaemon() is deprecated since Python 3.10
        t.start()
    q.join()
    # 结果保存文件
    bdpcCover.save()
    end = time.time()
    print(' 关键词共{0}个,查询成功{1}个,耗时{2}min'.format(all_num, success_num, (end - start) / 60))
m.58.com 111 www.guazi.com 110 anshan.baixing.com 80 as.58.com 70 anshan.ganji.com 66 www.xin.com 60 www.iautos.cn 54 www.che168.com 48 3g.ganji.com 47 www.58.com 33 so.iautos.cn 33 map.baidu.com 31 www.taoche.com 30 www.hx2car.com 26 m.iautos.cn 21
百度PC关键词覆盖率1的代码大家可以测试下,有问题反馈哦。