Asyncio is used to perform asynchronous programming in Python. It was introduced in Python 3.4 and has evolved considerably since then. We will use Python 3.7 for this project, in which we walk through a practical implementation of AsyncIO.
We will scrape the web and compare the time consumed by different approaches. We will go through the following steps:
- Synchronous Single Task
- Asynchronous Single Task
- Synchronous Multiple Tasks
- Asynchronous Multiple Tasks
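Before diving in, here is a minimal, self-contained sketch of the asyncio building blocks used throughout: coroutines declared with async def, suspension points marked with await, and asyncio.gather to run several coroutines concurrently (the messages and delays are illustrative only):

import asyncio
import time

async def say_after(delay, message):
    # await suspends this coroutine without blocking the event loop
    await asyncio.sleep(delay)
    print(message)

async def demo():
    start = time.time()
    # both coroutines wait concurrently, so this takes ~1 second, not 2
    await asyncio.gather(say_after(1, 'first'), say_after(1, 'second'))
    print('done in %1.1f seconds' % (time.time() - start))

asyncio.run(demo())  # entry point available since Python 3.7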
import requests
from bs4 import BeautifulSoup
import time
Synchronous Single Task
start = time.time()
def tic():
    return 'at %1.1f seconds' % (time.time() - start)
print('Started work: {}'.format(tic()))
texts = []
page = 'http://python.org.pk'
#page = 'http://indeed.com/jobs?q=python&start='+str(index)
print(page)
# identify the url of the job listings
web_result = requests.get(page).text
# use requests to actually visit the url
soup = BeautifulSoup(web_result, 'html.parser')
# parse the html of the resulting page
for listing in soup.findAll('span', {'class':'summary'}):
    # for each listing on the page
    texts.append(listing.text)
print('Ended work: {}'.format(tic()))
#print(web_result)
Started work: at 0.0 seconds
http://python.org.pk
Ended work: at 5.8 seconds
Asynchronous Single Task
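The asynchronous version relies on two third-party packages, aiohttp and async_timeout, since requests is blocking and cannot be awaited. If you do not already have them, they can be installed with pip (note the PyPI name of the timeout helper is async-timeout):

pip install aiohttp async-timeout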
import aiohttp
import asyncio
import async_timeout
import time
url_list = ['http://python.org.pk']
start = time.time()
def tic():
    return 'at %1.1f seconds' % (time.time() - start)

async def fetch(session, url):
    # give up on any single request after 10 seconds
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        #html = await fetch(session, 'http://python.org.pk')
        #print(html)
        # fire all requests concurrently and collect their bodies
        results = await asyncio.gather(*[fetch(session, url) for url in urls],
                                       return_exceptions=True)
        print('2')  # debug marker; appears in the output below
        raw_result = dict(zip(urls, results))
        return raw_result
print('Started work: {}'.format(tic()))
loop = asyncio.get_event_loop()
raw_result = loop.run_until_complete(main(url_list))
print('Ended work: {}'.format(tic()))
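As an aside, on Python 3.7 the two lines of event-loop boilerplate above can be replaced with asyncio.run, which creates and closes the loop for us; a minimal sketch, assuming the same main coroutine:

# Python 3.7+ equivalent of get_event_loop() / run_until_complete()
raw_result = asyncio.run(main(url_list))

Note that asyncio.run cannot be called when an event loop is already running (as is the case inside some notebook environments), which is why run_until_complete is used here.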
Started work: at 0.0 seconds
2
Ended work: at 0.8 seconds
Synchronous Multiple Tasks
start = time.time()
def tic():
    return 'at %1.1f seconds' % (time.time() - start)
print('Started work: {}'.format(tic()))
texts = []
# hold our job descriptions in this list
for index in range(0,100,10): # go through 10 pages of indeed
    page = 'http://indeed.com/jobs?q=data+scientist&start='+str(index)
    #page = 'http://indeed.com/jobs?q=python&start='+str(index)
    print(page)
    # identify the url of the job listings
    web_result = requests.get(page).text
    # use requests to actually visit the url
    soup = BeautifulSoup(web_result, 'html.parser')
    # parse the html of the resulting page
    for listing in soup.findAll('span', {'class':'summary'}):
        # for each listing on the page
        texts.append(listing.text)
        # append the text of the listing to our list
print('Ended work: {}'.format(tic()))
Started work: at 0.0 seconds
http://indeed.com/jobs?q=data+scientist&start=0
http://indeed.com/jobs?q=data+scientist&start=10
http://indeed.com/jobs?q=data+scientist&start=20
http://indeed.com/jobs?q=data+scientist&start=30
http://indeed.com/jobs?q=data+scientist&start=40
http://indeed.com/jobs?q=data+scientist&start=50
http://indeed.com/jobs?q=data+scientist&start=60
http://indeed.com/jobs?q=data+scientist&start=70
http://indeed.com/jobs?q=data+scientist&start=80
http://indeed.com/jobs?q=data+scientist&start=90
Ended work: at 28.0 seconds
# Uncomment the following line if you want to see the text returned
#print(texts)
Asynchronous Multiple Tasks
url_list = []
for index in range(0,100,10): # go through 10 pages of indeed
    page = 'http://indeed.com/jobs?q=data+scientist&start='+str(index)
    #page = 'http://indeed.com/jobs?q=python&start='+str(index)
    url_list.append(page)
url_list
['http://indeed.com/jobs?q=data+scientist&start=0',
 'http://indeed.com/jobs?q=data+scientist&start=10',
 'http://indeed.com/jobs?q=data+scientist&start=20',
 'http://indeed.com/jobs?q=data+scientist&start=30',
 'http://indeed.com/jobs?q=data+scientist&start=40',
 'http://indeed.com/jobs?q=data+scientist&start=50',
 'http://indeed.com/jobs?q=data+scientist&start=60',
 'http://indeed.com/jobs?q=data+scientist&start=70',
 'http://indeed.com/jobs?q=data+scientist&start=80',
 'http://indeed.com/jobs?q=data+scientist&start=90']
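As an aside, the same list can be built with a one-line comprehension; this is just an equivalent sketch of the loop above:

# equivalent comprehension for the ten Indeed page URLs
url_list = ['http://indeed.com/jobs?q=data+scientist&start=' + str(i)
            for i in range(0, 100, 10)]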
import aiohttp
import asyncio
import async_timeout
import time
start = time.time()
def tic():
    return 'at %1.1f seconds' % (time.time() - start)

async def fetch(session, url):
    # give up on any single request after 10 seconds
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        #html = await fetch(session, 'http://python.org.pk')
        #print(html)
        # fetch all ten pages concurrently over one shared session
        results = await asyncio.gather(*[fetch(session, url) for url in urls],
                                       return_exceptions=True)
        print('2')  # debug marker; appears in the output below
        raw_result = dict(zip(urls, results))
        return raw_result
print('Started work: {}'.format(tic()))
loop = asyncio.get_event_loop()
raw_result = loop.run_until_complete(main(url_list))
print('Ended work: {}'.format(tic()))
Started work: at 0.0 seconds
2
Ended work: at 3.3 seconds
# Uncomment following line to see the raw result
#raw_result
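Putting the timings side by side: the single-page fetch dropped from 5.8 seconds (synchronous) to 0.8 seconds (asynchronous), and the ten Indeed pages dropped from 28.0 seconds to 3.3 seconds, roughly an 8x speedup, because the asynchronous version overlaps the network waits instead of paying for them one after another.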
from bs4 import BeautifulSoup
texts = []
for k,v in raw_result.items():
    web_result = v
    soup = BeautifulSoup(web_result, 'html.parser')
    for listing in soup.findAll('span', {'class':'summary'}):
        texts.append(listing.text)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1,2), stop_words='english')
matrix = vect.fit_transform(texts)
# fit to the corpus and learn its vocabulary
print(len(vect.get_feature_names())) # how many features are there
freqs = [(word, matrix.getcol(idx).sum()) for word, idx in vect.vocabulary_.items()]
# sort from largest to smallest and print the top 25
for phrase, times in sorted(freqs, key=lambda x: -x[1])[:25]:
    print(phrase, times)
1407
data 150
learning 44
machine 42
machine learning 41
analytics 31
insights 23
scientist 20
data scientist 19
experience 18
predictive 18
science 16
scientists 16
analysis 15
team 15
data scientists 14
looking 13
mining 13
modeling 12
data mining 12
learning data 12
statistics 11
data science 11
development 10
tools 10
statistical 10
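For readers new to CountVectorizer, here is a tiny sketch of what ngram_range=(1, 2) means: the vectorizer counts both single words and adjacent word pairs, which is why phrases like 'machine learning' and 'data scientist' appear in the frequency list above (the toy corpus here is made up):

from sklearn.feature_extraction.text import CountVectorizer

toy = ['machine learning is fun']
v = CountVectorizer(ngram_range=(1, 2))
v.fit(toy)
print(v.get_feature_names())
# ['fun', 'is', 'is fun', 'learning', 'learning is', 'machine', 'machine learning']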