How can a Python crawler scrape Zhihu topics?

We are building a "viewpoints" feature, and the viewpoint "rooms" are organized much like Zhihu topics, so I had to find a way to crawl the topic tree. After a fair amount of fiddling I finally got it working. The code is written in Python; if you don't know Python, please learn the basics on your own first. If you do, just read the code below, it is directly usable.

#!/usr/bin/env python
#coding:utf-8
# Crawler for Zhihu topics
__author__ = 'haoning'

import urllib
import urllib2
import time
import re
import json
import uuid
import platform
import os
import sys
import cookielib
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://www.zhihu.com/topics',
    'Cookie': '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    # Fetch a page; a proxy should be plugged in here.
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 20)
        html = response.read()
        return html
    except:
        print "timeout"
        return None

def getTopics():
    # Scrape the topic categories from https://www.zhihu.com/topics
    # and store each (data_id, name) pair in the classify_new table.
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should be plugged in here as well
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})
        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name) VALUES(%s,%s)', (data_id, name))
            conn.commit()
    except Exception as e:
        print "get topic error", e

def get_extension(name):
    # Return the file extension (including the dot), or None.
    where = name.rfind('.')
    if where != -1:
        return name[where:len(name)]
    return None

def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    # Today's date as YYYY-MM-DD, used as a folder name.
    when = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    foldername = str(when)
    return foldername

def makeDateFolder(par, classify):
    # Create <parent>/<date>/<classify> and return its path.
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + classify
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + '/' + classify
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "kk", e
        return None

def download_img(url, classify):
    # Download a topic image into the date/classify folder and return its
    # relative path; if nothing was downloaded, the caller falls back to the
    # original link on the site.
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 15)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "//" + name
        #print "filename", filename
        try:
            if "e82bab09c_xs" not in str(url):
                if not os.path.exists(filename):
                    file_object = open(filename, 'w+b')
                    file_object.write(dataimg)
                    file_object.close()
                    return GetDateString() + '/' + classify + '/' + name
                else:
                    print "file exist"
            return None
        except IOError as e1:
            print "e1=", e1
            pass
    except Exception as e:
        print "eee", e
        pass
    return None

def get_topis(top_id, topic_name):
    # Page through one topic category via the TopicsPlazzaListV2 endpoint,
    # 20 entries at a time, and print every child topic it contains.
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    top_id = str(top_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next',
                  'params': '{"topic_id":' + top_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            data = urllib.urlencode(values)
            request = urllib2.Request(url, data, headers)
            response = urllib2.urlopen(request)
            html = response.read().decode('utf-8')
            if html is None:
                return
            json_str = json.loads(html)
            ms = json_str['msg']
            if len(ms) < 5:
                break
            msg = ms[0]
            #print msg
            soup = BeautifulSoup(str(msg))
            blks = soup.find_all('div', {'class': 'blk'})
            for blk in blks:
                page = blk.find('a').get('href')
                if page is not None:
                    node = page.replace("/topic/", "")
                    print node, page
        except urllib2.URLError as e:
            print "error is", e
            pass

def work():
    #getTopics()  # fetch the topic categories first
    curr.execute('select data_id,name from classify_new')
    results = curr.fetchall()
    for r in results:
        data_id = r[0]
        name = r[1]
        get_topis(data_id, name)

if __name__ == '__main__':
    i = 0
    while i < 40:
        work()
        i = i + 1
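The script assumes a MySQL database named zhihu with a table called classify_new, but the original post never shows its schema. Below is a minimal one-time setup sketch that matches the SELECT and INSERT statements above; the table and column names are taken from the crawler code, while the column types and sizes are my own assumptions and should be adjusted to your needs.

#coding:utf-8
# Setup sketch: create the classify_new table the crawler expects.
# Assumes the zhihu database already exists; column types are assumptions.
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
curr = conn.cursor()
curr.execute("""
    CREATE TABLE IF NOT EXISTS classify_new (
        id      INT AUTO_INCREMENT PRIMARY KEY,
        data_id VARCHAR(32)  NOT NULL,
        name    VARCHAR(255) NOT NULL
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()

Note that work() has the getTopics() call commented out, so classify_new must already contain the topic categories: run getTopics() once (or uncomment the call) before the paging loop in work() has anything to iterate over.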
