# Source code for pisak.blog.html_parsers

"""
Set of various HTML parsers.
"""
from bs4 import BeautifulSoup


def apply_linebreaks(text):
    """
    Turn every newline character of the text into an html ``<br>`` tag.

    :param text: text with python-style linebreaks.

    :return: text with html-style linebreaks.
    """
    # Cutting on "\n" and gluing the pieces back with the tag swaps each
    # newline for exactly one "<br>".
    pieces = text.split("\n")
    return "<br>".join(pieces)


def apply_paragraphs(text):
    """
    Apply html-style paragraphs to the text.

    The whole text is enclosed in one ``<p>...</p>`` pair and every blank
    line (two consecutive newline characters) becomes a paragraph boundary,
    i.e. ``</p><p>``. Single newlines are left untouched.

    :param text: text with python-style or no paragraphs.

    :return: text with html-style paragraphs.
    """
    paragraph_start = "<p>"
    paragraph_end = "</p>"
    # str.replace returns a new string instead of modifying in place, so its
    # result has to be kept; otherwise the blank lines never become
    # paragraph boundaries.
    body = text.replace("\n\n", paragraph_end + paragraph_start)
    return paragraph_start + body + paragraph_end


def embed_images(content, image_urls):
    """
    Embed img tags with the given image urls into the html content.

    Each url gets its own ``<img>`` tag. The tags are appended after the
    content, with a double ``<br>`` placed between the content and the first
    tag and between consecutive tags. When there is nothing to embed
    (empty list, empty string or ``None``) the content is returned unchanged.

    :param content: html text.
    :param image_urls: url to the image or list of urls.

    :return: content with img tag embedded.
    """
    # Nothing to embed: do not leave a dangling separator after the content.
    if not image_urls:
        return content
    line_break = "<br>"
    separator = 2 * line_break
    # A lone url is accepted as a plain string; treat it as a one-item list.
    if isinstance(image_urls, str):
        image_urls = [image_urls]
    img_tags = ["<img src='{}' >".format(image_url) for image_url in image_urls]
    return content + separator + separator.join(img_tags)


def delete_images(content):
    """
    Strip every img tag out of the html text.

    The content is parsed with BeautifulSoup (no parser backend is named
    here, so the library picks one), each ``<img>`` element is replaced
    with an empty string and the tree is rendered back with ``prettify()``.

    :param content: html text.

    :return: content with img tags removed.
    """
    soup = BeautifulSoup(content)
    images = soup.find_all("img")
    for image in images:
        image.replace_with("")
    return soup.prettify()


def list_images(content):
    """
    Collect the ``src`` attribute of every img tag found in a html text.

    :param content: html text.

    :return: list of image urls, one entry per img tag in the order returned
        by ``find_all`` (the entry is whatever ``get("src")`` yields for
        that tag).
    """
    soup = BeautifulSoup(content)
    urls = []
    for image in soup.find_all("img"):
        urls.append(image.get("src"))
    return urls


def extract_text(content):
    """
    Pull the plain text out of a html document.

    ``<br>`` tags are first turned into newline characters, then the text of
    the parsed document is taken and stripped of surrounding whitespace.

    :param content: html text.

    :return: plain text.
    """
    soup = BeautifulSoup(content)
    # Swap the <br> tags for "\n" before the text is read from the tree.
    convert_linebreaks(soup)
    plain = soup.get_text()
    return plain.strip()


def convert_linebreaks(parser):
    """
    Replace every ``<br>`` tag in the parsed document with a newline.

    The given parser object is modified in place; nothing is returned.

    :param parser: beautiful soup parser.
    """
    newline = "\n"
    br_tags = parser.find_all("br")
    for tag in br_tags:
        tag.replace_with(newline)