Add check that the uploaded file is UTF-8 encoded

This commit is contained in:
2023-12-04 11:49:04 +01:00
parent e971726578
commit 21e62e47d3
4 changed files with 43 additions and 417 deletions

View File

@@ -1,384 +0,0 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>untitled</title>
</head>
<body>
<div class=WordSection1>
<h1><b>
Festival des 3 Continents, Nantes 24.11 - 3.12 2023</b></h1>
<h2><b>
Anthologie du cinéma vietnamien</b></h2>
<p align="justify">&nbsp;</p>
<h2>19 films vietnamiens
de 1974 à 2022</h2>
<p align="justify">&nbsp;</p>
<img src="yda_anthologie_cinema_viet_fig_3.jpg" alt="We will meet again" width="510" height="500"><br><br>
<p align="justify"><b>We will meet
again</b> / Đến hẹn lại lên - Tran Vu, 1974, 108</p>
<p align="justify">Dimanche 26.11,
18:30 - Vendredi 1.12, 13:45</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Premier amour</b>
/ Mối tình đầu - Hai Ninh, 1977, 112</p>
<p align="justify">Samedi 25.11, 13:45
- Jeudi 30.11, 20:30</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>The faces of may</b>
/ Tháng năm, những gương mặt - Dang Nhat Minh, 1975, 37</p>
<img src="yda_anthologie_cinema_viet_fig_2.jpg" alt="Nostalgie de la campagne" width="510" height="500"><br><br>
<p align="justify"><b>Nostalgie de la
campagne</b>&nbsp;/ Thương nhớ đồng quê - Dang Nhat Minh,
1995, 116</p>
<p align="justify">Mercredi 29.11,
20:30 - Vendredi 1.12, 17:00</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Chom et Sa</b>
/ Chom và Sa - Pham Ky Nam, 1979, 70</p>
<p align="justify">Dimanche 26.11,
10:00 - Lundi 27.11, 16:15</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Hanoi through
whose eyes&nbsp;?</b> / Hà Nội trong mắt ai - Tran Van
Thuy, 1982, 45</p>
<p align="justify"><b>The story of kindness
</b>or<b> How to behave</b>&nbsp;/ Chuyện tử tế - Tran Van
Thuy, 1987, 43</p>
<p align="justify">Mardi 28.11, 16:00 -
Dimanche 3.12, 10:30</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Brothers</b>
/ Anh và em - Nguyen Huu Luyen &amp; Tran Vu, 1986, 87</p>
<p align="justify">Dimanche 26.11, 20:45
- Jeudi 30.11, 18:30</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>La fille du
fleuve</b> / Cô gái trên sông - Dang Nhat Minh, 1987, 100</p>
<p align="justify">Samedi 25.11, 20:45
- Jeudi 30.11, 10:15</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Troupe de cirque
ambulant</b> / Gánh xiếc rong - Viet Linh, 1988, 80</p>
<p align="justify">Lundi 27.11, 17:30 -
Dimanche 3.12, 17:30</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Fairytale for a
17-year-old girl</b> /  Chuyện cổ tích cho tuổi 17 -
Nguyen Xuan Son, 1988, 77</p>
<p align="justify">Mercredi 29.11,
13&nbsp;:00 - Samedi 2.12, 13&nbsp;:00</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>La lampe dans le
rêve</b> / Ngọn đèn trong mơ - Do Minh Tuan, 1988, 75</p>
<p align="justify">Mardi 28.11, 10:15 -
Vendredi 1.12, 18:45</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Money,
Money&nbsp;!</b> / Tiền ơi&nbsp;! - Tran Vu &amp; Nguyen
Huu Luyen, 1989, 93</p>
<p align="justify">Dimanche 26.11,
15:30 - Samedi 2.12, 15:45</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Enfance orageuse</b>
/ Tuổi thơ dữ dội - Nguyen Vinh Son, 1990, 135</p>
<p align="justify">Lundi 27.11, 13:30 -
Samedi 2.12, 20:30</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Please forgive me</b>
/ Hãy tha thứ cho em - Luu Trong Ninh, 1992, 90</p>
<p align="justify">Lundi 27.11, 18:30 -
Mercredi 29.11, 18:15</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>Piège d'amour</b>
/ Cạm bảy tình yêu - Pham Loc, 1992, 82</p>
<p align="justify">Lundi 27.11, 10:30 -
Mercredi 29.11, 14:45</p>
<p align="justify">&nbsp;</p>
<p align="justify"><b>In the lane</b>
/ Ngõ hẹp - Bach Diep, 1993, 98</p>
<p align="justify">Samedi 25.11, 10:15
- Mardi 28.11, 14:00</p>
<p align="justify">&nbsp;</p>
<img src="yda_anthologie_cinema_viet_fig_4.jpg" alt="L'Immeuble" width="510" height="500"><br><br>
<p align="justify"><b>L'Immeuble</b>
/ Chung cư - Viet Linh, 1999, 90</p>
<p align="justify">Samedi 25.11, 16:15
- Mercredi 29.11, 15:00</p>
<p align="justify">&nbsp;</p>
<img src="yda_anthologie_cinema_viet_fig_1.jpg" alt="Dust &amp; metal" width="510" height="510"><br><br>
<p align="justify"><b>Dust &amp; metal</b>
/ Cát bụi và kim loại - Esther Johnson, 2022, 83</p>
<p align="justify">Vendredi 1.12, 20:30
- Dimanche 3.12, 18:00</p>
<p align="justify">&nbsp;</p>
<p align="justify">Anthologie du cinéma vietnamien&nbsp;:</p>
<p align="justify"><a
href="https://www.3continents.com/fr/programme/2023/anthologie-du-cinema-vietnamien/"
target="_blank">
https://www.3continents.com/fr/programme/2023/anthologie-du-cinema-vietnamien/</a></p>
<p align="justify">Programme 2023, horaires et lieux&nbsp;:</p>
<p align="justify"><a
href="https://www.3continents.com/wp-content/uploads/f3c-prog-2023-40p-net-planche.pdf">https://www.3continents.com/wp-content/uploads/f3c-prog-2023-40p-net-planche.pdf</a></p>
<p align="justify">&nbsp;</p>
<p align="justify">&nbsp;</p>
<p align="justify">&nbsp;</p>
<p align="justify">&nbsp;</p>
</div>
</body>
</html>

View File

@@ -1 +0,0 @@
03-12-2023 10:35 [ Nantes_anthologie_cinema_viet.htm ] *** Cleanup réussi. ***

View File

@@ -10,7 +10,7 @@
<div class="well">
<ul>
<li>Seuls les fichiers au <b>format HTML</b> seront acceptés.</li>
<li>Seuls les fichiers au <b>format HTML et avec un encodage utf-8</b> seront acceptés.</li>
<li>La taille du fichier ne doit pas <b>dépasser 10 Mo</b>.</li>
</ul>
</div>

View File

@@ -6,6 +6,7 @@ import magic
import sys
import datetime
import time
import chardet
@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
def home(request):
@@ -40,38 +41,48 @@ def process_file(request, input_file, input_name):
temp_folder = request.registry.settings['temp_folder']
logfile_name = os.path.join(temp_folder, 'errors_log')
# --- controler le mime type
mime = magic.from_buffer(input_file.read(), mime=True)
# types de fichiers autorisés ?
if mime not in EXT_ALLOWED:
message = "Le format du fichier n'est pas valide. Téléchargement refusé."
add_error2log(logfile_name, input_name, message)
return message
# lire la taille du fichier
input_file.seek(0, 2) #seek to end
filesize = input_file.tell()
input_file.seek(0) # back to begining position
# --- controler la taille du fichier
if filesize > MAX_SIZE:
message = "La taille du fichier dépasse la limite autorisée. Téléchargement refusé."
add_error2log(logfile_name, input_name, message)
return message
# --- controler l'encodage UTF-8
try:
data = input_file.read().decode('utf-8')
except Exception as e:
message = "L'encodage fichier n'est pas valide (non utf-8). Téléchargement refusé."
add_error2log(logfile_name, input_name, message)
return message
# import pdb;pdb.set_trace()
# --- controle presence tag <body>
if data.find('<body') == -1 or data.find('</body>') == -1:
message = "Le format du fichier n'est pas valide (absence de tag <body> ou </body>). Téléchargement refusé."
add_error2log(logfile_name, input_name, message)
return message
input_file.seek(0) # back to begining position
# controle OK, traiter le fichier
output_name = "clean_" + input_name
output_file = os.path.join(temp_folder, output_name)
message = clean_html(input_file, output_file)
if message:
add_error2log(logfile_name, input_name, message)
else:
# lire la taille du fichier
input_file.seek(0, 2) #seek to end
filesize = input_file.tell()
input_file.seek(0) # back to begining position
# controler la taille du fichier
if filesize > MAX_SIZE:
message = "La taille du fichier dépasse la limite autorisée. Téléchargement refusé."
add_error2log(logfile_name, input_name, message)
else:
# recherche des tag <body>
data = input_file.read().decode('utf-8')
if data.find('<body') == -1 or data.find('</body>') == -1:
message = "Le format du fichier n'est pas valide (absence de tag <body> ou </body>). Téléchargement refusé."
add_error2log(logfile_name, input_name, message)
else:
input_file.seek(0) # back to begining position
# controle OK, traiter le fichier
output_name = "clean_" + input_name
output_file = os.path.join(temp_folder, output_name)
message = clean_html(input_file, output_file)
if message:
add_error2log(logfile_name, input_name, message)
else:
add_error2log(logfile_name, input_name, "*** Cleanup réussi. ***")
add_error2log(logfile_name, input_name, "*** Cleanup réussi. ***")
return message
@@ -79,7 +90,6 @@ def clean_html(input_file, output_file):
# cleanup undesirable tags in html file
encoding = "utf-8"
message = ""
# import pdb;pdb.set_trace()
try:
with open(output_file, 'w') as fo:
@@ -92,6 +102,7 @@ def clean_html(input_file, output_file):
for line_in_bytes in input_file:
line_in = line_in_bytes.decode('utf-8')
line_out = ""
if not body:
init = line_in.find("<body")
if init != -1 and init == 0:
@@ -152,12 +163,12 @@ def clean_html(input_file, output_file):
elif tag[0] in ["p"]:
line_out += "<p align=\"justify\">"
elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li",
"ol", "/ol", "ul", "/ul", "strong", "/strong", "sub", "/sub",
"sup", "/sup", "u", "/u"]:
"ol", "/ol", "ul", "/ul", "strong", "/strong", "sub", "/sub",
"sup", "/sup", "u", "/u"]:
line_out += "<{}>".format(tag[0])
elif tag[0] in ["h1", "/h1", "h2", "/h2", "h3", "/h3", "h4", "/h4", "h5",
"/h5", "h6", "/h6", "/p", "body", "/body", "/html", "/div",
"/img", "/a"]:
"/h5", "h6", "/h6", "/p", "body", "/body", "/html", "/div",
"/img", "/a"]:
line_out += "<{}>".format(tag[0])
elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]:
line_out += "<{}>".format(tag[0])