"""Fetch a web page and print the text it contains.

Downloads the page at ``url``, parses it with BeautifulSoup, and prints every
text node whose direct parent tag is not listed in ``blacklist``.
"""

import requests
from bs4 import BeautifulSoup  # pip install bs4

url = 'https://www.troyhunt.com/the-773-million-record-collection-1-data-reach/'

# requests applies no timeout by default, so without one a stalled server
# would hang the script forever.
res = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of scraping an error page.
res.raise_for_status()
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
# ``string=True`` selects every text node in the document; it is the current
# spelling of the deprecated ``text=True`` filter.
text = soup.find_all(string=True)

# Parent tag names whose text nodes are left out of the output.
blacklist = {
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
}

# Every kept text node is followed by a single space (so the result ends with
# a trailing space); join builds the string in one pass rather than through
# repeated concatenation.
output = ''.join(
    '{} '.format(t) for t in text if t.parent.name not in blacklist
)

print(output)