added id in tag

This commit is contained in:
Phuoc Cao
2025-12-20 11:38:47 +01:00
parent 76f840b48c
commit 785f69b59b
2 changed files with 23 additions and 10 deletions

View File

@@ -8,6 +8,7 @@ import datetime
import time import time
import charset_normalizer import charset_normalizer
import html import html
import re
@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2') @view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
def home(request): def home(request):
@@ -182,6 +183,7 @@ def clean_html(input_file, output_file):
if tag: if tag:
tag[0] = tag[0].lower() tag[0] = tag[0].lower()
id_match_regex=r'.*(id=\"\w+\").*'
if tag[0] in ["table", "div", "img", "a"]: if tag[0] in ["table", "div", "img", "a"]:
line_out += "<{}".format(s) line_out += "<{}".format(s)
@@ -191,18 +193,29 @@ def clean_html(input_file, output_file):
else: else:
# sinon contine de recopier la ligne # sinon contine de recopier la ligne
skip_tag = False skip_tag = False
elif tag[0] in ["p"]:
line_out += "<p align=\"justify\">" elif tag[0] in ["span", "p", "h1", "h2", "h3", "h4", "h5", "h6"]:
elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", line_out += "<{}".format(tag[0])
"ol", "/ol", "ul", "/ul", "strong", "/strong", "sub", "/sub", if tag[0] == "p":
"sup", "/sup", "u", "/u"]: # ajouter text-align dans p
line_out += ' style="text-align:justify;"'
if len(tag) > 1:
matched = re.match(id_match_regex, line_in)
if matched:
line_out += ' '+ matched.group(1)
line_out += '>'
elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", "ol", "/ol",
"ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", "u", "/u"]:
line_out += "<{}>".format(tag[0]) line_out += "<{}>".format(tag[0])
elif tag[0] in ["h1", "/h1", "h2", "/h2", "h3", "/h3", "h4", "/h4", "h5",
"/h5", "h6", "/h6", "/p", "body", "/body", "/html", "/div", elif tag[0] in ["/span", "/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p", "body", "/body",
"/img", "/a"]: "/html", "/div", "/img", "/a"]:
line_out += "<{}>".format(tag[0]) line_out += "<{}>".format(tag[0])
elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]: elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]:
line_out += "<{}>".format(tag[0]) line_out += "<{}>".format(tag[0])
elif tag[0] in ["br", "br/"]: elif tag[0] in ["br", "br/"]:
line_out += "<{}>".format(tag[0]) line_out += "<{}>".format(tag[0])

View File

@@ -14,7 +14,7 @@ requires = [
'pyramid_jinja2', 'pyramid_jinja2',
'pyramid_debugtoolbar', 'pyramid_debugtoolbar',
'python-magic', 'python-magic',
'html', 'html3',
'charset_normalizer', 'charset_normalizer',
'waitress', 'waitress',
] ]
@@ -27,7 +27,7 @@ tests_require = [
setup( setup(
name='html_cleanup', name='html_cleanup',
version='1.0', version='1.1',
description='html_cleanup', description='html_cleanup',
long_description=README + '\n\n' + CHANGES, long_description=README + '\n\n' + CHANGES,
classifiers=[ classifiers=[