# Source code for pisak.blog.html_parsers
"""
Set of various HTML parsers.
"""
from bs4 import BeautifulSoup
def apply_linebreaks(text):
    """
    Convert python-style linebreaks to html-style ones.

    Every newline character in the text is replaced with a ``<br>`` tag.

    :param text: text with python-style linebreaks.
    :return: text with html-style linebreaks.
    """
    line_break = "<br>"
    return text.replace("\n", line_break)
def apply_paragraphs(text):
    """
    Apply html-style paragraphs to the text.

    The whole text is wrapped in a ``<p>...</p>`` pair and every blank
    line (two consecutive newlines) becomes a paragraph boundary.

    :param text: text with python-style or no paragraphs.
    :return: text with html-style paragraphs.
    """
    paragraph_start = "<p>"
    paragraph_end = "</p>"
    # str.replace returns a new string; the result must be kept, otherwise
    # the paragraph boundaries are silently lost.
    body = text.replace("\n\n", paragraph_end + paragraph_start)
    return paragraph_start + body + paragraph_end
def embed_images(content, image_urls):
    """
    Embed img tag with image url into the html content.

    Each url is turned into an ``<img>`` tag; the tags are appended to the
    content, separated from it and from each other by two ``<br>`` tags.
    When there are no urls the content is returned unchanged.

    :param content: html text.
    :param image_urls: url to the image or list of urls.
    :return: content with img tag embedded.
    """
    # Nothing to embed: do not append a dangling separator.
    if not image_urls:
        return content
    line_break = "<br>"
    separator = 2 * line_break
    # A single url may be passed as a plain string.
    if isinstance(image_urls, str):
        image_urls = [image_urls]
    # NOTE(review): urls are interpolated as-is, without any escaping.
    image_tags = ["<img src='{}' >".format(image_url)
                  for image_url in image_urls]
    return content + separator + separator.join(image_tags)
def delete_images(content):
    """
    Remove all the img tags.

    The content is parsed with Beautiful Soup, every ``img`` element is
    replaced with an empty string and the tree is serialized back with
    ``prettify()``.

    :param content: html text.
    :return: content with img tags removed, as returned by ``prettify()``.
    """
    # NOTE(review): no parser is named here, so Beautiful Soup picks one
    # itself; pinning a parser could change the serialized output, so the
    # call is left as it was -- confirm which parser is expected.
    parser = BeautifulSoup(content)
    for img in parser.find_all("img"):
        img.replace_with("")
    return parser.prettify()
def list_images(content):
    """
    Extract and list all images in a html text.

    :param content: html text.
    :return: list with the ``src`` attribute of every img tag, in document
        order; an entry is None when the tag has no ``src`` attribute.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    parser = BeautifulSoup(content, "html.parser")
    return [img.get("src") for img in parser.find_all("img")]
def convert_linebreaks(parser):
    """
    Convert all html-style linebreaks to the python-style ones.

    The parser tree is modified in place: every ``br`` element is replaced
    with a newline character. Nothing is returned.

    :param parser: beautiful soup parser.
    """
    for break_line in parser.find_all("br"):
        break_line.replace_with("\n")