Source: python中国网  Date: 2019-11-13

  Among SEO practitioners there is a widely accepted approach for judging which sites in an industry perform well: take a batch of industry keywords, query the top 10 Baidu results for each keyword, extract the URLs, and then count how often each domain appears. The domains that appear most often are the strong sites in that industry.

  In terms of how results are actually displayed on the SERP, there are two ways to count:

  1) Query a batch of keywords and count every appearance of each domain, then aggregate at the end. (If the same domain has N ranking URLs on one keyword's SERP, it is counted N times.)

  2) Query a batch of keywords and, for each domain, count only the number of keywords for which it appears on page one. (If the same domain has N ranking URLs on one keyword's SERP, it is counted once; this amounts to counting page-one keywords.)
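  To make the difference concrete, here is a minimal sketch (not from the original article); the domain list is hypothetical and stands for the page-one results of a single keyword:

from collections import Counter

# Hypothetical domains extracted from one keyword's page-one SERP
serp_domains = ['www.58.com', 'www.58.com', 'www.guazi.com']

# Method 1: every ranking URL counts, so www.58.com contributes 2
method1 = Counter(serp_domains)       # Counter({'www.58.com': 2, 'www.guazi.com': 1})

# Method 2: a domain counts at most once per keyword
method2 = Counter(set(serp_domains))  # Counter({'www.58.com': 1, 'www.guazi.com': 1})

print(method1, method2)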

  This article uses the first counting method. Prepare a file kwd.txt with one keyword per line.
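  A hypothetical kwd.txt (the article's actual keyword list is not shown; judging from the results below, the test keywords were used-car queries for Anshan) might look like:

鞍山二手车
鞍山二手车报价
鞍山二手车交易市场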

  Only a small batch of keywords is used for the test here; in real work, a large set of industry keywords is needed for the statistics to be reliable. The code is as follows.

# -*- coding: utf-8 -*-
"""
一个关键词serp上同一个域名出现N个url排名 计算N次
"""
import requests
from pyquery import PyQuery as pq
import threading
import queue
import time
from urllib.parse import urlparse

class bdpcCover(threading.Thread):

    def __init__(self):
        threading.Thread.__init__(self)

    # Read the keyword file and put one keyword per line into a queue
    @staticmethod
    def read_file(filepath):
        q = queue.Queue()
        with open(filepath, encoding='utf-8') as f:
            for kwd in f:
                q.put(kwd.strip())
        return q

    # Fetch the SERP HTML for one keyword
    def get_html(self, url, retry=2):
        try:
            r = requests.get(url=url, headers=user_agent, timeout=5)
        except Exception as e:
            print('Failed to fetch HTML:', e)
            if retry > 0:
                return self.get_html(url, retry - 1)
        else:
            html = r.content.decode('utf-8')  # r.text sometimes misdetects the encoding
            return html

    # Extract all encrypted organic-result URLs from the SERP HTML
    def get_encrypt_urls(self, html):
        encrypt_url_list = []
        if html and '_百度搜索' in html:  # title suffix that marks a real Baidu SERP
            doc = pq(html)
            try:
                a_list = doc('.t a').items()
            except Exception as e:
                print('Failed to extract encrypted urls from the SERP:', e)
            else:
                for a in a_list:
                    encrypt_url = a.attr('href')
                    if encrypt_url and encrypt_url.startswith('http://www.baidu.com/link?url='):
                        encrypt_url_list.append(encrypt_url)
        return encrypt_url_list

    # Resolve one encrypted URL to its real target via the redirect's Location header
    def decrypt_url(self, encrypt_url, retry=1):
        try:
            encrypt_url = encrypt_url.replace('http://', 'https://')
            # requests.head does not follow redirects by default,
            # so the 302 response still carries the Location header
            r = requests.head(encrypt_url, headers=user_agent)
        except Exception as e:
            print(encrypt_url, 'failed to resolve:', e)
            if retry > 0:
                return self.decrypt_url(encrypt_url, retry - 1)
        else:
            return r.headers.get('Location')

    # Resolve every encrypted page-one URL for one keyword to its real URL
    def get_real_urls(self,encrypt_url_list):
        real_url_list = [self.decrypt_url(encrypt_url) for encrypt_url in encrypt_url_list]
        return real_url_list

    # Extract the domain part of one URL
    def get_domain(self, real_url):
        try:
            res = urlparse(real_url)
        except Exception as e:
            print(e, real_url)
            domain = "xxx"  # placeholder for unparseable URLs
        else:
            domain = res.netloc
        return domain

    # Extract the domain part of every real page-one URL for one keyword
    def get_domains(self, real_url_list):
        domain_list = [self.get_domain(real_url) for real_url in real_url_list]
        return domain_list

    # Worker thread: consume keywords from the queue until it is drained
    def run(self):
        global success_num
        while 1:
            kwd = q.get()
            try:
                url = "https://www.baidu.com/s?ie=utf-8&wd={0}".format(kwd)
                html = self.get_html(url)
                encrypt_url_list = self.get_encrypt_urls(html)
                real_url_list = self.get_real_urls(encrypt_url_list)
                domain_list = self.get_domains(real_url_list)
                if domain_list:
                    with threadLock:  # result and success_num are shared across threads
                        for domain in domain_list:
                            result[domain] = result.get(domain, 0) + 1
                        success_num += 1
                        print('{0} keywords queried successfully'.format(success_num))
            except Exception as e:
                print(e)
            finally:
                q.task_done()

    # Sort the aggregated counts and save them to a file
    @staticmethod
    def save():
        print('Saving results...')
        res_sort = sorted(result.items(), key=lambda s: s[1], reverse=True)
        print(res_sort)
        with open('bdpc_result1.txt', 'w', encoding="utf-8") as f:
            for domain, value in res_sort:
                f.write('{0}\t{1}\n'.format(domain, value))


if __name__ == "__main__":
    start = time.time()

    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'}
    threadLock = threading.Lock()  # lock protecting shared state
    result = {}      # domain -> appearance count
    success_num = 0  # number of keywords queried successfully
    q = bdpcCover.read_file('kwd.txt')
    all_num = q.qsize()  # total number of keywords

    # Start the worker threads
    for i in range(5):
        t = bdpcCover()
        t.daemon = True  # daemon threads, so the process can exit after q.join()
        t.start()
    q.join()

    # Save the results to a file
    bdpcCover.save()
    end = time.time()
    print('\n{0} keywords in total, {1} queried successfully, took {2:.1f} min'.format(all_num, success_num, (end - start) / 60))
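  For the second counting method described at the start (a domain counted at most once per keyword), only the aggregation step inside run() needs to change. A minimal sketch of that variant, not part of the original script:

# Method-2 variant of the aggregation step in run():
# set() deduplicates the domains within one keyword's SERP,
# so each domain is counted at most once per keyword.
if domain_list:
    with threadLock:
        for domain in set(domain_list):
            result[domain] = result.get(domain, 0) + 1
        success_num += 1

  Running the test keywords produced the following domain counts (the contents of bdpc_result1.txt):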

m.58.com    111
www.guazi.com   110
anshan.baixing.com  80
as.58.com   70
anshan.ganji.com    66
www.xin.com 60
www.iautos.cn   54
www.che168.com  48
3g.ganji.com    47
www.58.com  33
so.iautos.cn    33
map.baidu.com   31
www.taoche.com  30
www.hx2car.com  26
m.iautos.cn 21
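  These raw counts can also be normalized into coverage shares. A minimal post-processing sketch, not from the original article, assuming bdpc_result1.txt is tab-separated "domain<TAB>count" as written by save():

# Convert raw counts into each domain's share of all counted page-one URLs
rows = []
with open('bdpc_result1.txt', encoding='utf-8') as f:
    for line in f:
        domain, count = line.rstrip('\n').split('\t')
        rows.append((domain, int(count)))

total = sum(count for _, count in rows)
for domain, count in rows:
    print('{0}\t{1}\t{2:.1%}'.format(domain, count, count / total))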


  Feel free to test this Baidu PC keyword coverage (part 1) code and report any problems you find.