-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Expand file tree
/
Copy pathmultithreaded_class.py
More file actions
38 lines (32 loc) · 1.17 KB
/
multithreaded_class.py
File metadata and controls
38 lines (32 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import _thread
import time
# Paths (e.g. '/wiki/Kevin_Bacon') already claimed for scraping.
# NOTE(review): shared by both worker threads without a lock — list.append
# itself won't corrupt the list, but two threads can still grab the same
# article before either appends; confirm whether duplicates are acceptable.
visited = []
def getLinks(thread_name, bsObj):
    """Return the not-yet-visited internal article links on a parsed page.

    Args:
        thread_name: label used only for the progress print.
        bsObj: BeautifulSoup document for a Wikipedia article page.

    Returns:
        List of bs4 ``<a>`` Tag objects under ``div#bodyContent`` whose
        ``href`` is an article path (``/wiki/...`` with no colon) not
        already recorded in the module-level ``visited`` list.
    """
    print('Getting links in {}'.format(thread_name))
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # Bug fix: `visited` stores href path strings, but `links` holds bs4 Tag
    # objects, so the original `link not in visited` was always True and the
    # visited filter never excluded anything. Compare the href string instead.
    return [link for link in links if link.attrs['href'] not in visited]
# Define a function for the thread
def scrape_article(thread_name, path):
    """Crawl Wikipedia from *path*, following one random unvisited link at a time.

    Args:
        thread_name: label for log output.
        path: article path such as '/wiki/Kevin_Bacon'.

    Side effects: appends each scraped path to the shared ``visited`` list,
    prints progress, and sleeps 5s per request as a politeness throttle.
    Stops when a page yields no unvisited article links.
    """
    # Iterative walk instead of the original unbounded recursion, which
    # would raise RecursionError once the crawl chain exceeded the
    # interpreter's recursion limit (~1000 articles).
    while path is not None:
        visited.append(path)
        # Close the HTTP response deterministically; the original leaked it.
        with urlopen('http://en.wikipedia.org{}'.format(path)) as html:
            time.sleep(5)  # throttle, as in the original (sleep before parsing)
            bsObj = BeautifulSoup(html, 'html.parser')
        title = bsObj.find('h1').get_text()
        print('Scraping {} in thread {}'.format(title, thread_name))
        links = getLinks(thread_name, bsObj)
        path = None  # default: stop if no unvisited links remain
        if len(links) > 0:
            # random.choice replaces the manual randint index arithmetic.
            newArticle = random.choice(links).attrs['href']
            print(newArticle)
            path = newArticle
# Create two threads as follows
try:
    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon',))
    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python',))
except Exception:
    # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt) and
    # SystemExit are no longer swallowed during startup.
    print ('Error: unable to start threads')
# _thread workers are killed when the main thread exits, so keep it alive.
# Sleep instead of the original `while 1: pass` busy-wait, which pinned a
# CPU core at 100% doing nothing.
while True:
    time.sleep(1)