beautifulsoup scraping paragraphs from html

Solutions on MaxInterview for "BeautifulSoup scraping paragraphs from HTML" by the best coders in the world.

showing results for - "beautifulsoup scraping paragraphs from html"
Claudia
30 Aug 2018
from bs4 import BeautifulSoup

# Minimal HTML fixture: one heading, two paragraphs (only the first has a
# class attribute) and a short unordered list.
SIMPLE_HTML = '''<html>
<head></head>
<body>
<h1>This is a title</h1>
<p class="subtitle">Lorem ipsum dolor sit amet.</p>
<p>Here's another p without a class</p>
<ul>
    <li>Sarah</li>
    <li>Mary</li>
    <li>Charlotte</li>
    <li>Carl</li>
</ul>
</body>
</html>'''

# Parse the fixture once at import time with the 'html.parser' backend; the
# functions below all query this shared module-level soup.
simple_soup = BeautifulSoup(SIMPLE_HTML, 'html.parser')

# Find paragraph
def find_paragraph():
    """Print the text of the first ``<p>`` whose class is ``subtitle``.

    Reads the module-level ``simple_soup``. ``find()`` returns ``None`` when
    no element matches, in which case the ``.string`` access raises
    ``AttributeError``.
    """
    print(simple_soup.find('p', {'class': 'subtitle'}).string)


def find_other_paragraph():
    """Print the text of the first ``<p>`` that does not have the ``subtitle`` class.

    Reads the module-level ``simple_soup``. Raises ``IndexError`` if no
    paragraph qualifies (every ``<p>`` is a subtitle, or there are none).
    """
    paragraphs = simple_soup.find_all('p')  # every <p> element in the document
    # A <p> with no class attribute has no 'class' key in attrs, so a plain
    # attrs.get('class') would return None and the `in` test would raise
    # TypeError. Defaulting to [] makes class-less paragraphs count as
    # "not subtitle".
    other_paragraph = [p for p in paragraphs if 'subtitle' not in p.attrs.get('class', [])]
    print(other_paragraph[0].string)

# Run both demos when the module is executed: first the subtitle paragraph,
# then the paragraph without the subtitle class.
find_paragraph()
find_other_paragraph()