python BS4获取href网址
近期在看 scrape 那一章节时,有个 s_urls[0]['href'] 没法理解,以为 Python 有非数字下标的数组。后来多方查询才知道,这是 BeautifulSoup 中对 Tag 属性的访问:用字符串作下标,取到的是该标签对应 HTML 属性的值。
https://stackoverflow.com/questions/5815747/beautifulsoup-getting-href?noredirect=1&lq=1
from bs4 import BeautifulSoup
# Thread runs a target callable in its own thread of execution
from threading import Thread
import urllib.request
#Location of restaurants
home_url="https://www.yelp.com"
find_what="Restaurants"
location="London"
#Get all restaurants that match the search criteria
#https://www.yelp.com/search?find_desc=Restaurants&find_loc=London
search_url="https://www.yelp.com/search?find_desc=" +find_what+"&find_loc="+location
s_html= urllib.request.urlopen(search_url).read() #urlopen(search_url).read()
print("here")
soups_s=BeautifulSoup(s_html,"lxml")
#Get URLs of top 10 Restaurants in London
s_urls=soups_s.select(‘.biz-name‘[:10])
print(len(s_urls))
print(s_urls)
url=[]
print(type(s_urls))
print(type(s_urls[0]))
print(s_urls[0])
print(s_urls[0][‘href‘])
for u in range(len(s_urls)):
url.append(home_url+s_urls[u][‘href‘])
#https://www.yelp.com/biz/duck-and-waffle-london-3?osq=Restaurants
print(url)
#Function that will do actual scraping job
def scrape(ur):
    """Download one business page and print its title, address and phone.

    The page is parsed with BeautifulSoup's "lxml" parser. For each of the
    CSS classes ``biz-page-title``, ``street-address`` and ``biz-phone`` the
    first matching element's stripped text is reported; a field with no
    matching element is omitted. A separator line always ends the record.

    Args:
        ur: Absolute URL of the page to fetch.

    Returns:
        None. The record is written to standard output.
    """
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(ur) as response:
        html = response.read()
    soup = BeautifulSoup(html, "lxml")

    title = soup.select('.biz-page-title')
    saddress = soup.select('.street-address')
    phone = soup.select('.biz-phone')

    lines = []
    if title:
        lines.append("Title: " + title[0].getText().strip())
    if saddress:
        lines.append("Street Address: " + saddress[0].getText().strip())
    if phone:
        lines.append("Phone number: " + phone[0].getText().strip())
    lines.append("---------------------")

    # Emit the whole record with a single print call: this function is run
    # from several threads, and separate prints per field let the fields of
    # different pages interleave in the output.
    print("\n".join(lines))
# Start one thread per collected page URL so the downloads overlap instead
# of running one after another.
threadlist = []
for page_url in url:
    t = Thread(target=scrape, args=(page_url,))
    t.start()
    threadlist.append(t)

# Block until every scraping thread has finished.
for t in threadlist:
    t.join()
文章来自:http://www.cnblogs.com/uxiuxi/p/7451325.html