diff --git a/html_cleanup/views/default.py b/html_cleanup/views/default.py index 9d0d294..4d94340 100644 --- a/html_cleanup/views/default.py +++ b/html_cleanup/views/default.py @@ -33,7 +33,7 @@ def home(request): message = "Votre fichier est converti et nettoyé : " + output_name return { - 'page_title': "HTML cleanup 1.2", + 'page_title': "HTML cleanup 26.01", 'message': message, 'file_url': file_url, 'file_name': output_name, @@ -119,13 +119,37 @@ def process_file(request, input_file, input_name): return message, input_encoding, output_name def clean_html(input_file, output_file): + # # cleanup undesirable tags in html file + # + #20241118: thn: changed chardet to charset_normalizer + #20251216: thn: changed align:"justify" to style="text-align:justify;" in

, + #because it was deprecated in HTML 5 + #accepted id in + #20251217: thn, replaced re.match by re.search + #20251223: thn, accepted hyphen in id. Avoid: [.:] because they are using in CSS + #20251224: check input syntax for id + #Only support in one line + #20260104: treated "div" the same as "span", "h1"..."h6" + #20260104: removed "span", + #20260122: move the function check_input_name into the function clean_html as inner function + #version: 3_5_3_2 (26.01) + + def check_input_name(attribute): + ret_flag=False + name_regex=r'.*=\s*\"(.*)\".*' + matched=re.search(name_regex, attribute) + if matched: + if matched.group(1)[:1].isalpha(): # [:1] instead of [0]: an empty value such as id="" gives "" (not alphabetic) rather than raising IndexError + ret_flag=True + return ret_flag + + encoding = "utf-8" message = "" try: with open(input_file, 'r') as fi, open(output_file, 'w') as fo: - fo.write("\n\n\nuntitled\n\n".format(encoding)) body = False skip_tag = False @@ -136,9 +160,11 @@ def clean_html(input_file, output_file): line_out = "" if not body: - init = line_in.find("\n\t\n\t\t\n\t\tuntitled\n\t\n".format(encoding)) + init=0 if body: nb_lines += 1 @@ -196,32 +222,31 @@ def clean_html(input_file, output_file): elif tag[0] in ["span", "p", "h1", "h2", "h3", "h4", "h5", "h6"]: line_out += "<{}".format(tag[0]) - if tag[0] == "p": - # ajouter text-align dans p - line_out += ' style="text-align:justify;"' - if len(tag) > 1: + # 26.01 : ne plus ajouter text-align dans

+ # if tag[0] == "p": + # line_out += ' style="text-align:justify;"' + if len(tag) > 1 and not skip_tag: matched = re.search(id_match_regex, line_in) if matched: - line_out += ' '+ matched.group(1) + if check_input_name(matched.group(1)): + line_out += ' '+ matched.group(1) line_out += '>' - elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", "ol", "/ol", - "ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", "u", "/u"]: + elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "/li", "ol", "/ol", + "ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", + "u", "/u"]: line_out += "<{}>".format(tag[0]) - - elif tag[0] in ["/span", "/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p", "body", "/body", - "/html", "/div", "/img", "/a"]: + elif tag[0] in ["/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p", + "body", "/body", "/html", "/div", "/img", "/a"]: line_out += "<{}>".format(tag[0]) - elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]: line_out += "<{}>".format(tag[0]) - - elif tag[0] in ["br", "br/"]: + elif tag[0] in ["br", "br/", "hr"]: line_out += "<{}>".format(tag[0]) if line_out != "": - fo.write(line_out + "\n") - + fo.write(line_out + "\n") + except Exception as e: message = "Error: {}".format(str(e))