diff --git a/html_cleanup/views/default.py b/html_cleanup/views/default.py index 9d0d294..4d94340 100644 --- a/html_cleanup/views/default.py +++ b/html_cleanup/views/default.py @@ -33,7 +33,7 @@ def home(request): message = "Votre fichier est converti et nettoyé : " + output_name return { - 'page_title': "HTML cleanup 1.2", + 'page_title': "HTML cleanup 26.01", 'message': message, 'file_url': file_url, 'file_name': output_name, @@ -119,13 +119,37 @@ def process_file(request, input_file, input_name): return message, input_encoding, output_name def clean_html(input_file, output_file): + # # cleanup undesirable tags in html file + # + #20241118: thn: changed chardet to charset_normalizer + #20251216: thn: changed align:"justify" to style="text-align:justify;" in
,
+ #because it was deprecated in HTML 5
+ #accepted id in
+ #20251217: thn, replaced re.match with re.search
+ #20251223: thn, accepted hyphen in id. Avoid [.:] because they are used in CSS
+ #20251224: check input syntax for id
+ #Only support
+ # if tag[0] == "p": + # line_out += ' style="text-align:justify;"' + if len(tag) > 1 and not skip_tag: matched = re.search(id_match_regex, line_in) if matched: - line_out += ' '+ matched.group(1) + if check_input_name(matched.group(1)): + line_out += ' '+ matched.group(1) line_out += '>' - elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", "ol", "/ol", - "ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", "u", "/u"]: + elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "/li", "ol", "/ol", + "ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", + "u", "/u"]: line_out += "<{}>".format(tag[0]) - - elif tag[0] in ["/span", "/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p", "body", "/body", - "/html", "/div", "/img", "/a"]: + elif tag[0] in ["/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p", + "body", "/body", "/html", "/div", "/img", "/a"]: line_out += "<{}>".format(tag[0]) - elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]: line_out += "<{}>".format(tag[0]) - - elif tag[0] in ["br", "br/"]: + elif tag[0] in ["br", "br/", "hr"]: line_out += "<{}>".format(tag[0]) if line_out != "": - fo.write(line_out + "\n") - + fo.write(line_out + "\n") + except Exception as e: message = "Error: {}".format(str(e))