From 21e62e47d36c2a3022bba57c49475a8e239d8f4b Mon Sep 17 00:00:00 2001 From: Phuoc Cao Date: Mon, 4 Dec 2023 11:49:04 +0100 Subject: [PATCH] added check file encoding must be utf-8 --- .../clean_Nantes_anthologie_cinema_viet.htm | 384 ------------------ html_cleanup/static/temp/errors_log | 1 - html_cleanup/templates/home.jinja2 | 2 +- html_cleanup/views/default.py | 73 ++-- 4 files changed, 43 insertions(+), 417 deletions(-) delete mode 100644 html_cleanup/static/temp/clean_Nantes_anthologie_cinema_viet.htm delete mode 100644 html_cleanup/static/temp/errors_log diff --git a/html_cleanup/static/temp/clean_Nantes_anthologie_cinema_viet.htm b/html_cleanup/static/temp/clean_Nantes_anthologie_cinema_viet.htm deleted file mode 100644 index 2e5f859..0000000 --- a/html_cleanup/static/temp/clean_Nantes_anthologie_cinema_viet.htm +++ /dev/null @@ -1,384 +0,0 @@ - - - -untitled - - - - - -
- - - -

-Festival des 3 Continents, Nantes 24.11 - 3.12 2023

- - - -

-Anthologie du cinéma vietnamien

- - - -

 

- - - -

19 films vietnamiens - -de 1974 à 2022

- - - -

 

- - - -We will meet again

- - - -

We will meet - -again / Đến hẹn lại lên - Tran Vu, 1974, 108’

- - - -

Dimanche 26.11, - -18:30 - Vendredi 1.12, 13:45

- - - -

 

- - - -

Premier amour - / Mối tình đầu - Hai Ninh, 1977, 112’

- - - -

Samedi 25.11, 13:45 - -- Jeudi 30.11, 20:30

- - - -

 

- - - -

The faces of may - / Tháng năm, những gương mặt- Dang Nhat Minh, 1975, 37’

- - - -Nostalgie de la campagne

- - - -

Nostalgie de la - -campagne / Thương nhớ đồng quê - Dang Nhat Minh, - -1995, 116’

- - - -

Mercredi 29.11, - -20:30 - Vendredi 1.12, 17:00

- - - -

 

- - - -

Chom et Sa - / Chom và Sa - Pham Ky Nam, 1979, 70’

- - - -

Dimanche 26.11, - -10:00 - Lundi 27.11, 16:15

- - - -

 

- - - -

Hanoi through - -whose eyes ? / Hà Nội trong mắt ai - Tran Van - -Thuy, 1982, 45’

- - - -

The story of kindness - -or How to behave / Chuyện tử tế - Tran Van - -Thuy, 1987, 43’

- - - -

Mardi 28.11, 16:00 - - -Dimanche 3.12, 10:30

- - - -

 

- - - -

Brothers - / Anh và em - Nguyen Huu Luyen & Tran Vu, 1986, 87’

- - - -

Dimanche 26.11, 20:45 - -- Jeudi 30.11, 18:30

- - - -

 

- - - -

La fille du - -fleuve / Cô gái trên sông - Dang Nhat Minh, 1987, 100’

- - - -

Samedi 25.11, 20:45 - -- Jeudi 30.11, 10:15

- - - -

 

- - - -

Troupe de cirque - -ambulant / Gánh xiếc rong - Viet Linh, 1988, 80’

- - - -

Lundi 27.11, 17:30 - - -Dimanche 3.12, 17:30

- - - -

 

- - - -

Fairytale for a - -17-year-old girl /  Chuyện cổ tích cho tuổi 17 - - -Nguyen Xuan Son, 1988, 77’

- - - -

Mercredi 29.11, - -13 :00 - Samedi 2.12, 13 :00

- - - -

 

- - - -

La lampe dans le - -rêve / Ngọn đèn trong mơ - Do Minh Tuan, 1988, 75’

- - - -

Mardi 28.11, 10:15 - - -Vendredi 1.12, 18:45

- - - -

 

- - - -

Money, - -Money ! / Tiền ơi ! - Tran Vu & Nguyen - -Huu Luyen, 1989, 93’

- - - -

Dimanche 26.11, - -15:30 - Samedi 2.12, 15:45

- - - -

 

- - - -

Enfance orageuse - / Tuổi thơ dữ dội - Nguyen Vinh Son, 1990, 135’

- - - -

Lundi 27.11, 13:30 - - -Samedi 2.12, 20:30

- - - -

 

- - - -

PLease forgive me - / Hãy tha thứ cho em - Luu Trong Ninh, 1992, 90’

- - - -

Lundi 27.11, 18:30 - - -Mercredi 29.11, 18:15

- - - -

 

- - - -

Piège d’amour - / Cạm bảy tình yêu - Pham Loc, 1992, 82’

- - - -

Lundi 27.11, 10:30 - - -Mercredi 29.11, 14:45

- - - -

 

- - - -

In the lane - / Ngõ hẹp - Bach Diep, 1993, 98’

- - - -

Samedi 25.11, 10:15 - -- Mardi 28.11, 14:00

- - - -

 

- - - -L’Immeuble

- - - -

L’Immeuble - / Chung cư - Viet Linh, 1999, 90’

- - - -

Samedi 25.11, 16:15 - -- Mercredi 29.11, 15:00

- - - -

 

- - - -Dust & metal

- -

Dust & metal - / Cát bụi và kim loại - Esther Johnson, 2022, 83’

- - - -

Vendredi 1.12, 20:30 - -- Dimanche 3.12, 18:00

- - - -

 

- - - -

Anthologie du cinéma vietnamien :

- - - -

-https://www.3continents.com/fr/programme/2023/anthologie-du-cinema-vietnamien/

- - - -

Programme 2023, horaires et lieux :

- - - -

https://www.3continents.com/wp-content/uploads/f3c-prog-2023-40p-net-planche.pdf

- - - -

 

- - - -

 

- - - -

 

- - - -

 

- - - -
- - - - - - - - - diff --git a/html_cleanup/static/temp/errors_log b/html_cleanup/static/temp/errors_log deleted file mode 100644 index a1174b9..0000000 --- a/html_cleanup/static/temp/errors_log +++ /dev/null @@ -1 +0,0 @@ -03-12-2023 10:35 [ Nantes_anthologie_cinema_viet.htm ] *** Cleanup réussi. *** diff --git a/html_cleanup/templates/home.jinja2 b/html_cleanup/templates/home.jinja2 index 51c022b..76c158a 100644 --- a/html_cleanup/templates/home.jinja2 +++ b/html_cleanup/templates/home.jinja2 @@ -10,7 +10,7 @@
diff --git a/html_cleanup/views/default.py b/html_cleanup/views/default.py index ea992c7..717aeb0 100644 --- a/html_cleanup/views/default.py +++ b/html_cleanup/views/default.py @@ -6,6 +6,7 @@ import magic import sys import datetime import time +import chardet @view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2') def home(request): @@ -40,38 +41,48 @@ def process_file(request, input_file, input_name): temp_folder = request.registry.settings['temp_folder'] logfile_name = os.path.join(temp_folder, 'errors_log') + # --- controler le mime type mime = magic.from_buffer(input_file.read(), mime=True) # types de fichiers autorisés ? if mime not in EXT_ALLOWED: message = "Le format du fichier n'est pas valide. Téléchargement refusé." add_error2log(logfile_name, input_name, message) + return message + + # lire la taille du fichier + input_file.seek(0, 2) #seek to end + filesize = input_file.tell() + input_file.seek(0) # back to begining position + # --- controler la taille du fichier + if filesize > MAX_SIZE: + message = "La taille du fichier dépasse la limite autorisée. Téléchargement refusé." + add_error2log(logfile_name, input_name, message) + return message + + # --- controler l'encodage UTF-8 + try: + data = input_file.read().decode('utf-8') + except Exception as e: + message = "L'encodage fichier n'est pas valide (non utf-8). Téléchargement refusé." + add_error2log(logfile_name, input_name, message) + return message + + # import pdb;pdb.set_trace() + # --- controle presence tag + if data.find('') == -1: + message = "Le format du fichier n'est pas valide (absence de tag ou ). Téléchargement refusé." + add_error2log(logfile_name, input_name, message) + return message + + input_file.seek(0) # back to begining position + # controle OK, traiter le fichier + output_name = "clean_" + input_name + output_file = os.path.join(temp_folder, output_name) + message = clean_html(input_file, output_file) + if message: + add_error2log(logfile_name, input_name, message) else: - # lire la taille du fichier - input_file.seek(0, 2) #seek to end - filesize = input_file.tell() - input_file.seek(0) # back to begining position - # controler la taille du fichier - if filesize > MAX_SIZE: - message = "La taille du fichier dépasse la limite autorisée. Téléchargement refusé." - add_error2log(logfile_name, input_name, message) - else: - # recherche des tag - data = input_file.read().decode('utf-8') - if data.find('') == -1: - message = "Le format du fichier n'est pas valide (absence de tag ou ). Téléchargement refusé." - add_error2log(logfile_name, input_name, message) - else: - input_file.seek(0) # back to begining position - # controle OK, traiter le fichier - output_name = "clean_" + input_name - output_file = os.path.join(temp_folder, output_name) - message = clean_html(input_file, output_file) - if message: - add_error2log(logfile_name, input_name, message) - else: - add_error2log(logfile_name, input_name, "*** Cleanup réussi. ***") - - + add_error2log(logfile_name, input_name, "*** Cleanup réussi. ***") return message @@ -79,7 +90,6 @@ def clean_html(input_file, output_file): # cleanup undesirable tags in html file encoding = "utf-8" message = "" - # import pdb;pdb.set_trace() try: with open(output_file, 'w') as fo: @@ -92,6 +102,7 @@ def clean_html(input_file, output_file): for line_in_bytes in input_file: line_in = line_in_bytes.decode('utf-8') line_out = "" + if not body: init = line_in.find("" elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", - "ol", "/ol", "ul", "/ul", "strong", "/strong", "sub", "/sub", - "sup", "/sup", "u", "/u"]: + "ol", "/ol", "ul", "/ul", "strong", "/strong", "sub", "/sub", + "sup", "/sup", "u", "/u"]: line_out += "<{}>".format(tag[0]) elif tag[0] in ["h1", "/h1", "h2", "/h2", "h3", "/h3", "h4", "/h4", "h5", - "/h5", "h6", "/h6", "/p", "body", "/body", "/html", "/div", - "/img", "/a"]: + "/h5", "h6", "/h6", "/p", "body", "/body", "/html", "/div", + "/img", "/a"]: line_out += "<{}>".format(tag[0]) elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]: line_out += "<{}>".format(tag[0])