Asyncio is used to perform asynchronous programming in Python. It was introduced in Python 3.4 and has evolved considerably since then. We will use Python 3.7 for this project, in which we walk through a practical implementation of AsyncIO.
We will scrape the web and compare the time consumed by different approaches. We will go through the following steps:
- Synchronous Single Task
- Asynchronous Single Task
- Synchronous Multiple Tasks
- Asynchronous Multiple Tasks
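Before diving in, here is a minimal, self-contained sketch of the asyncio building blocks used throughout: coroutines declared with async def, suspension points marked with await, and asyncio.gather to run several coroutines concurrently (the messages and delays are illustrative only):

import asyncio
import time

async def say_after(delay, message):
    # await suspends this coroutine without blocking the event loop
    await asyncio.sleep(delay)
    print(message)

async def demo():
    start = time.time()
    # both coroutines wait concurrently, so this takes ~1 second, not 2
    await asyncio.gather(say_after(1, 'first'), say_after(1, 'second'))
    print('done in %1.1f seconds' % (time.time() - start))

asyncio.run(demo())  # entry point available since Python 3.7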
import requests
from bs4 import BeautifulSoup
import time
Synchronous Single Task
start = time.time()
def tic():
    return 'at %1.1f seconds' % (time.time() - start)
print('Started work: {}'.format(tic()))
texts = []
page = 'http://python.org.pk'
#page = 'http://indeed.com/jobs?q=python&start='+str(index)
print(page)
# identify the url of the job listings
web_result = requests.get(page).text
# use requests to actually visit the url
soup = BeautifulSoup(web_result, 'html.parser')
# parse the html of the resulting page
for listing in soup.findAll('span', {'class':'summary'}):
    # for each listing on the page
    texts.append(listing.text)
print('Ended work: {}'.format(tic()))
#print(web_result)
Started work: at 0.0 seconds
http://python.org.pk
Ended work: at 5.8 seconds
Asynchronous Single Task
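The asynchronous version relies on two third-party packages, aiohttp and async_timeout, since requests is blocking and cannot be awaited. If you do not already have them, they can be installed with pip (note the PyPI name of the timeout helper is async-timeout):

pip install aiohttp async-timeout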
import aiohttp
import asyncio
import async_timeout
import time
url_list = ['http://python.org.pk']
start = time.time()
def tic():
    return 'at %1.1f seconds' % (time.time() - start)

async def fetch(session, url):
    # give up on any single request after 10 seconds
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        #html = await fetch(session, 'http://python.org.pk')
        #print(html)
        # fire all requests concurrently and collect their bodies
        results = await asyncio.gather(*[fetch(session, url) for url in urls],
                                       return_exceptions=True)
        print('2')  # debug marker; appears in the output below
        raw_result = dict(zip(urls, results))
        return raw_result
print('Started work: {}'.format(tic()))
loop = asyncio.get_event_loop()
raw_result = loop.run_until_complete(main(url_list))
print('Ended work: {}'.format(tic()))
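As an aside, on Python 3.7 the two lines of event-loop boilerplate above can be replaced with asyncio.run, which creates and closes the loop for us; a minimal sketch, assuming the same main coroutine:

# Python 3.7+ equivalent of get_event_loop() / run_until_complete()
raw_result = asyncio.run(main(url_list))

Note that asyncio.run cannot be called when an event loop is already running (as is the case inside some notebook environments), which is why run_until_complete is used here.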
Started work: at 0.0 seconds
2
Ended work: at 0.8 seconds
Synchronous Multiple Tasks
start = time.time()
def tic():
    return 'at %1.1f seconds' % (time.time() - start)
print('Started work: {}'.format(tic()))
texts = []
# hold our job descriptions in this list
for index in range(0,100,10): # go through 10 pages of indeed
    page = 'http://indeed.com/jobs?q=data+scientist&start='+str(index)
    #page = 'http://indeed.com/jobs?q=python&start='+str(index)
    print(page)
    # identify the url of the job listings
    web_result = requests.get(page).text
    # use requests to actually visit the url
    soup = BeautifulSoup(web_result, 'html.parser')
    # parse the html of the resulting page
    for listing in soup.findAll('span', {'class':'summary'}):
        # for each listing on the page
        texts.append(listing.text)
        # append the text of the listing to our list
print('Ended work: {}'.format(tic()))
Started work: at 0.0 seconds
http://indeed.com/jobs?q=data+scientist&start=0
http://indeed.com/jobs?q=data+scientist&start=10
http://indeed.com/jobs?q=data+scientist&start=20
http://indeed.com/jobs?q=data+scientist&start=30
http://indeed.com/jobs?q=data+scientist&start=40
http://indeed.com/jobs?q=data+scientist&start=50
http://indeed.com/jobs?q=data+scientist&start=60
http://indeed.com/jobs?q=data+scientist&start=70
http://indeed.com/jobs?q=data+scientist&start=80
http://indeed.com/jobs?q=data+scientist&start=90
Ended work: at 28.0 seconds
# Uncomment the following line if you want to see the text returned
#print(texts)
Asynchronous Multiple Tasks
url_list = []
for index in range(0,100,10): # go through 10 pages of indeed
    page = 'http://indeed.com/jobs?q=data+scientist&start='+str(index)
    #page = 'http://indeed.com/jobs?q=python&start='+str(index)
    url_list.append(page)
url_list
['http://indeed.com/jobs?q=data+scientist&start=0',
 'http://indeed.com/jobs?q=data+scientist&start=10',
 'http://indeed.com/jobs?q=data+scientist&start=20',
 'http://indeed.com/jobs?q=data+scientist&start=30',
 'http://indeed.com/jobs?q=data+scientist&start=40',
 'http://indeed.com/jobs?q=data+scientist&start=50',
 'http://indeed.com/jobs?q=data+scientist&start=60',
 'http://indeed.com/jobs?q=data+scientist&start=70',
 'http://indeed.com/jobs?q=data+scientist&start=80',
 'http://indeed.com/jobs?q=data+scientist&start=90']
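As an aside, the same list can be built with a one-line comprehension; this is just an equivalent sketch of the loop above:

# equivalent comprehension for the ten Indeed page URLs
url_list = ['http://indeed.com/jobs?q=data+scientist&start=' + str(i)
            for i in range(0, 100, 10)]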
import aiohttp
import asyncio
import async_timeout
import time
start = time.time()
def tic():
    return 'at %1.1f seconds' % (time.time() - start)

async def fetch(session, url):
    # give up on any single request after 10 seconds
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        #html = await fetch(session, 'http://python.org.pk')
        #print(html)
        # fetch all ten pages concurrently over one shared session
        results = await asyncio.gather(*[fetch(session, url) for url in urls],
                                       return_exceptions=True)
        print('2')  # debug marker; appears in the output below
        raw_result = dict(zip(urls, results))
        return raw_result
print('Started work: {}'.format(tic()))
loop = asyncio.get_event_loop()
raw_result = loop.run_until_complete(main(url_list))
print('Ended work: {}'.format(tic()))
Started work: at 0.0 seconds
2
Ended work: at 3.3 seconds
# Uncomment following line to see the raw result
#raw_result
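Putting the timings side by side: the single-page fetch dropped from 5.8 seconds (synchronous) to 0.8 seconds (asynchronous), and the ten Indeed pages dropped from 28.0 seconds to 3.3 seconds, roughly an 8x speedup, because the asynchronous version overlaps the network waits instead of paying for them one after another.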
from bs4 import BeautifulSoup
texts = []
for k,v in raw_result.items():
    web_result = v
    soup = BeautifulSoup(web_result, 'html.parser')
    for listing in soup.findAll('span', {'class':'summary'}):
        texts.append(listing.text)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1,2), stop_words='english')
matrix = vect.fit_transform(texts)
# fit to the corpus and learn its vocabulary
print(len(vect.get_feature_names())) # how many features are there
freqs = [(word, matrix.getcol(idx).sum()) for word, idx in vect.vocabulary_.items()]
# sort from largest to smallest and print the top 25
for phrase, times in sorted(freqs, key=lambda x: -x[1])[:25]:
    print(phrase, times)
1407
data 150
learning 44
machine 42
machine learning 41
analytics 31
insights 23
scientist 20
data scientist 19
experience 18
predictive 18
science 16
scientists 16
analysis 15
team 15
data scientists 14
looking 13
mining 13
modeling 12
data mining 12
learning data 12
statistics 11
data science 11
development 10
tools 10
statistical 10
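For readers new to CountVectorizer, here is a tiny sketch of what ngram_range=(1, 2) means: the vectorizer counts both single words and adjacent word pairs, which is why phrases like 'machine learning' and 'data scientist' appear in the frequency list above (the toy corpus here is made up):

from sklearn.feature_extraction.text import CountVectorizer

toy = ['machine learning is fun']
v = CountVectorizer(ngram_range=(1, 2))
v.fit(toy)
print(v.get_feature_names())
# ['fun', 'is', 'is fun', 'learning', 'learning is', 'machine', 'machine learning']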