beautifulsoup¶
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
pip install beautifulsoup4
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_text)
links = [element.get('href') for element in soup.find_all('a')]
BeautifulSoup()¶
- class
beautifulsoup.
BeautifulSoup
(html_text: str, parser: str = 'html.parser')¶ Парсер html
soup = BeautifulSoup(some_html_string)
soup = BeautifulSoup(some_html_string, 'html.parser')
soup = BeautifulSoup(some_html_string, 'lxml')
soup = BeautifulSoup(some_html_string, 'lxml-xml')
soup = BeautifulSoup(some_html_string, 'html5lib')
-
body
¶ Возвращает
beautifulsoup.element.Tag
-
head
¶ Возвращает
beautifulsoup.element.Tag
-
title
¶ Возвращает
beautifulsoup.element.Tag
-
get_text
()¶ Возвращает строку со всем текстом страницы, без HTML-разметки
-
find
(name=None, attrs={}, recursive=True, text=None, **kwargs)¶ name = None
attrs = {}
recursive = True
text = None
id
string
Возвращает первый найденный элемент,
beautifulsoup.element.Tag
elem = soup.find(id='myId')
elem = soup.find('h2', string='Python')
elem = soup.find('h2', string=lambda text: 'Python' in text)
-
findAll
(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) → beautifulsoup.element.ResultSet¶ Поиск элементов на странице
span_list = bs_obj.findAll('span', {'class': 'green'})
for span in span_list:
    print(span.get_text())
hs = bs_obj.findAll({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})
id_text_elem = bs_obj.findAll(id='text')
imgs = bs_obj.findAll('img', {'src': re.compile(r'\.\./img/.*\.jpg')})
imgs = bs_obj.findAll(lambda tag: len(tag.attrs) == 2)
-
prettify
() → str¶ Возвращает отформатированную строку с содержимым документа
print(soup.prettify())
-