diff --git a/html_cleanup/views/default.py b/html_cleanup/views/default.py index a164513..1fdd2e6 100644 --- a/html_cleanup/views/default.py +++ b/html_cleanup/views/default.py @@ -8,6 +8,7 @@ import datetime import time import charset_normalizer import html +import re @view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2') def home(request): @@ -182,6 +183,7 @@ def clean_html(input_file, output_file): if tag: tag[0] = tag[0].lower() + id_match_regex=r'.*(id=\"\w+\").*' if tag[0] in ["table", "div", "img", "a"]: line_out += "<{}".format(s) @@ -191,18 +193,29 @@ def clean_html(input_file, output_file): else: # sinon contine de recopier la ligne skip_tag = False - elif tag[0] in ["p"]: - line_out += "
" - elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", - "ol", "/ol", "ul", "/ul", "strong", "/strong", "sub", "/sub", - "sup", "/sup", "u", "/u"]: + + elif tag[0] in ["span", "p", "h1", "h2", "h3", "h4", "h5", "h6"]: + line_out += "<{}".format(tag[0]) + if tag[0] == "p": + # ajouter text-align dans p + line_out += ' style="text-align:justify;"' + if len(tag) > 1: + matched = re.match(id_match_regex, line_in) + if matched: + line_out += ' '+ matched.group(1) + line_out += '>' + + elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", "ol", "/ol", + "ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", "u", "/u"]: line_out += "<{}>".format(tag[0]) - elif tag[0] in ["h1", "/h1", "h2", "/h2", "h3", "/h3", "h4", "/h4", "h5", - "/h5", "h6", "/h6", "/p", "body", "/body", "/html", "/div", - "/img", "/a"]: + + elif tag[0] in ["/span", "/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p", "body", "/body", + "/html", "/div", "/img", "/a"]: line_out += "<{}>".format(tag[0]) + elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]: line_out += "<{}>".format(tag[0]) + elif tag[0] in ["br", "br/"]: line_out += "<{}>".format(tag[0]) diff --git a/setup.py b/setup.py index 8818457..04b339d 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ requires = [ 'pyramid_jinja2', 'pyramid_debugtoolbar', 'python-magic', - 'html', + 'html3', 'charset_normalizer', 'waitress', ] @@ -27,7 +27,7 @@ tests_require = [ setup( name='html_cleanup', - version='1.0', + version='1.1', description='html_cleanup', long_description=README + '\n\n' + CHANGES, classifiers=[