就找了多个例证练习go语言爬取豆瓣电影top250,

2019-09-21 07:58栏目:大奖888官网登录
TAG:

  最近学习go,就找了一个例子练习go语言爬取豆瓣电影top250,思路大概就是获取网页,然后根据页面元素,用正则表达式匹配电影名称、评分、评价人数。原文有个地方需要修改:pattern4(匹配 <img width="100" alt=... src= 的正则),修改后就可以运行了
这个例子可以再修改成并发的形式,提升性能(参考 golang 并发 chan)
```
// NOTE(review): this Go snippet was mangled during scraping -- `go func {`
// is missing its parameter list and call parentheses, both `strconv.Itoa`
// calls lost their page-index argument, and the douban URL literal before
// "&filter=" was stripped. It cannot compile as shown; kept byte-identical
// pending recovery of the original post.
// `sem` is a counting channel: each goroutine sends one value when done,
// and the main flow (further below) receives ten values to wait for all.
var sem chan int = make(chan int,10);
for i := 0; i < 10; i++ {
go func {
// Request headers imitating a desktop Chrome browser.
header := map[string]string{
"Host": "movie.douban.com",
"Connection": "keep-alive",
"Cache-Control": "max-age=0",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8",
"Referer": "",
}
// NOTE(review): both lines below lost the argument to strconv.Itoa
// (presumably the page offset) and the base URL literal.
fmt.Println("正在抓取第" + strconv.Itoa + "页......")
url := "" + strconv.Itoa + "&filter="
spider := &Spider{url, header}
html := spider.get_html_header()

Python 2.7 按照顺序提示,输入账号密码之后可以获取到知乎热门新闻的标题链接。如果想获取知乎其他信息可以自行修改。

求助!!怎样用python爬虫采集佳缘的用户信息
三弟初学python,写了几个前后相继想抓取佳缘网址的顾客音讯,不过正则相称之后是none,想请教各位大神原因在哪import urllib2,urllibfrom urllib import urlencodeimport re,sys,os,time,Queue,thread,cookielibtesturl = ' = ' = '133574961'passwd = 'lixiaoming'values = {'password':passwd,'name':uid}postdata = urllib.urlencodecookie = cookielib.CookieJar()urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor,urllib2.HTTPHandler)headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36', 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8', }req = urllib2.Request( url = '', data = postdata, headers = headers)urlopener.openpage = urlopener.open.read()unicodePage = page.decode#print unicodePagerst = re.compile('

',re.S)items = rst.findall(unicodePage)print itemsfor item in items: print item[0],item[1]

        // NOTE(review): continuation of the goroutine body above, collapsed
        // onto a single line by the scraper. The regex literals for
        // pattern2/pattern3/pattern4 lost their capture groups, every
        // regexp.MustCompile / FindAllStringSubmatch call lost its
        // arguments, the Printf/WriteString escape sequences lost their
        // backslashes (`%sn`, `"t"`, `"rn"`), and the trailing `close`
        // lost its `(sem)` argument. Kept byte-identical pending recovery.
        pattern2 := `评价`        rp2 := regexp.MustCompile        find_txt2 := rp2.FindAllStringSubmatch                pattern3 := `property="v:average">`        rp3 := regexp.MustCompile        find_txt3 := rp3.FindAllStringSubmatch        pattern4 := `<img width="100" alt="图片 3" src=`        rp4 := regexp.MustCompile        find_txt4 := rp4.FindAllStringSubmatch        for i := 0; i < len(find_txt2); i++ {            fmt.Printf("%s %s %sn", find_txt4[i][1], find_txt3[i][1], find_txt2[i][1], )            f.WriteString(find_txt4[i][1] + "t" + find_txt3[i][1] + "t" + find_txt2[i][1] + "t" + "rn")        }        sem <- 0    }}for i := 0; i < 10; i++ { <-sem }close

![并发效果截图](https://user-gold-cdn.xitu.io/2018/3/20/162429602098ad78?w=719&h=302&f=png&s=10912)&emsp;&emsp;到这里go爬虫部分已经介绍完毕,百无聊赖之际又写了一个python版,python很简洁

直接上代码啦

<code>
import re
import requests
import cookielib
from PIL import Image
import time
import json
import webbrowser
from attr import attrib
from lxml import etree
import urllib2
import urlparse
# Shared HTTP state for the whole script: one browser-imitating UA header,
# one cookie file name, and one requests session reused by every later call.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
filename = 'cookie'
session = requests.Session()
# NOTE(review): the original assigned cookielib.CookieJar(), which has no
# load()/save() -- the load below always raised and the saved login state
# was never restored. LWPCookieJar is the file-backed jar this code needs.
session.cookies = cookielib.LWPCookieJar()
try:
    session.cookies.load(filename=filename, ignore_discard=True)
except (IOError, cookielib.LoadError):
    # First run (no cookie file yet) or a corrupt file: fall through to a
    # fresh interactive login below.
    print('cookie fail')
# <input type="hidden" name="_xsrf" value="f1f90f1cfe8ec5c732ef0d8833ccabe8"/>
def get_xsrf():
    """Fetch the zhihu.com home page and pull out the hidden _xsrf token.

    The token must accompany every login POST. Raises IndexError if the
    page no longer contains the hidden input (site layout change).
    """
    response = session.get('https://www.zhihu.com', headers=headers)
    html = response.text
    get_xsrf_pattern = re.compile(r'<input type="hidden" name="_xsrf" value="(.*?)"')
    _xsrf = re.findall(get_xsrf_pattern, html)[0]
    return _xsrf
def get_captcha():
t = str(int(time.time() * 1000))
captcha_url = 'https://www.zhihu.com/captcha.gif?r='

  • t + "&type=login"
    response = session.get(captcha_url, headers=headers)
    with open('cptcha.gif', 'wb') as f:
    f.write(response.content)
    im = Image.open('cptcha.gif')
    im.show()
    captcha = raw_input('Verification code:')
    print captcha
    return captcha
    def login(username, password):
    if re.match(r'd{11}$', account):
    print('phone logining')
    url = 'http://www.zhihu.com/login/phone_num'
    data = {'_xsrf': get_xsrf(),
    'password': password,
    'remember_me': 'true',
    'phone_num': username
    }
    else:
    print('email longing')
    url = 'https://www.zhihu.com/login/email'
    data = {'_xsrf': get_xsrf(),
    'password': password,
    'remember_me': 'true',
    'email': username
    }
    data['captcha'] = get_captcha()
    result = session.post(url, data=data, headers=headers)
    print((json.loads(result.text))['msg']+' codeLogin')
    # session.cookies.save(ignore_discard=True, ignore_expires=True)
    def nextMore(offset, start):
    url = 'https://www.zhihu.com/node/TopStory2FeedList'
    data = {'params': {'offset':offset, 'start':start},
    'method': 'next'
    }
    result = session.post(url, data=data, headers=headers)
    print((json.loads(result.text))['msg'] + ' ')
    def download(url, headers, proxy, num_retries, data=None):
    headers = headers or {}
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
    proxy_params = {urlparse.urlparse(url).scheme: proxy}
    opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
    response = opener.open(request)
    html = response.read()
    code = response.code
    except urllib2.URLError as e:
    print 'Download error:', e.reason
    html = ''
    if hasattr(e, 'code'):
    code = e.code
    if num_retries > 0 and 500 <= code < 600:
    # retry 5XX HTTP errors
    return download(url, headers, proxy, num_retries - 1, data)
    else:
    code = None
    return html
    if name == 'main':
    account = raw_input('account:')
    secret = raw_input('password:')
    login(account, secret)
    get_url = 'https://www.zhihu.com/explore/recommendations'
    resp = session.get(get_url, headers=headers, allow_redirects=False)
    page = etree.HTML(resp.text)
    i = 1
    while (i<6):
    string = "//div[@id='zh-recommend']/div[2]/div[1]/div[" + str(i)
  • "]/h2/a"
    hrefs = page.xpath(string)
    for href in hrefs:
    print href.text + 'n' + 'https://www.zhihu.com'
  • str(href.attrib['href'])
    url = 'https://www.zhihu.com'
  • str(href.attrib['href'])
    i = i + 1
    webbrowser.open(get_url, new=0, autoraise=True)
    </code>

# coding=utf-8

import re
import urllib2
import datetime
def getDouban:
print "爬取第" + str+"页"
html = "" + str + "&filter="
try:
page = urllib2.urlopen(html, timeout=3)
result = page.read()
score = re.findall('property="v:average">',result)
person = re.findall('(.?)评价',result)
name= re.findall('<img width="100" alt="" src=', result)
j=0
while j<len:
print name[j], score[j]+'分', person[j]
j=j+1
except:
print i
starttime = datetime.datetime.now()
params=[]
for i in range:
getDouban
endtime = datetime.datetime.now()
print "爬虫历时"+str((endtime-starttime).seconds)+"s达成"
```
图片 4

By 戴近视镜的莫林

版权声明:本文由大奖888-www.88pt88.com-大奖888官网登录发布于大奖888官网登录,转载请注明出处:就找了多个例证练习go语言爬取豆瓣电影top250,