@@ -61,7 +61,7 @@ def __repr__(self):
61
61
class ImageDownloader :
62
62
""" "Smart" images downloader."""
63
63
64
- def __init__ (
64
+ def __init__ ( # pylint: disable=too-many-arguments
65
65
self ,
66
66
out_path_maker : OutPathMaker ,
67
67
skip_list : Optional [List [str ]] = None ,
@@ -92,7 +92,7 @@ def __init__(
92
92
self ._running = False
93
93
self ._replace_image_names = replace_image_names
94
94
95
- # pylint: disable=R0912(too-many-branches)
95
+ # pylint: disable=R0912(too-many-branches),too-many-arguments
96
96
def download_images (self , images : List [Union [str , ImageLink ]]) -> dict :
97
97
"""
98
98
Download and save images from the list.
@@ -101,7 +101,6 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
101
101
"""
102
102
103
103
replacement_mapping : Dict [str , str ] = {}
104
-
105
104
images_count = len (images )
106
105
107
106
# TODO: Refactor this.
@@ -116,20 +115,10 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
116
115
117
116
assert image_url not in replacement_mapping , f'BUG: already downloaded image "{ image_url } "...'
118
117
119
- if self ._need_to_skip_url (image_url ):
120
- logging .debug ('Image %d downloading was skipped...' , image_num + 1 )
121
- continue
122
-
123
- if not is_url (image_url ):
124
- logging .warning ('Image %d ["%s"] probably has incorrect URL...' , image_num + 1 , image_url )
118
+ image_download_url = self ._get_image_download_url (image_url , image_num )
125
119
126
- if self ._out_path_maker .article_base_url :
127
- logging .debug ('Trying to add base URL "%s"...' , self ._out_path_maker .article_base_url )
128
- image_download_url = f'{ self ._out_path_maker .article_base_url } /{ image_url } '
129
- else :
130
- image_download_url = str (Path (self ._out_path_maker .article_file_path ).parent / image_url )
131
- else :
132
- image_download_url = image_url
120
+ if image_download_url is None :
121
+ continue
133
122
134
123
try :
135
124
mime_type , _ = mimetypes .guess_type (image_download_url )
@@ -158,10 +147,7 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
158
147
continue
159
148
160
149
if self ._replace_image_names :
161
- _ , image_ext = split_file_ext (image_filename )
162
- image_content_hash = hashlib .sha384 (image_content ).hexdigest ()
163
- logging .debug ('Image content hash: %s' , image_filename )
164
- image_filename = f'{ image_content_hash } .{ image_ext } '
150
+ image_filename = self ._get_hashed_image_name (image_filename , image_content )
165
151
166
152
except Exception as e :
167
153
if self ._skip_all_errors :
@@ -185,19 +171,9 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
185
171
image_local_url , real_image_path = self ._get_real_path (image_url , image_filename )
186
172
187
173
if self ._replace_image_names and real_image_path .exists ():
188
- # Image by this content hash exists, but possibly this is a collision.
189
- with open (real_image_path , 'rb' ) as real_file :
190
- if not is_binary_same (real_file , BytesIO (image_content )):
191
- # Fix collision, changing name.
192
- img_num : int = 0
193
- while real_image_path .exists ():
194
- numerated_image_filename = f'{ image_num } { image_filename } '
195
- real_image_path = self ._out_path_maker .get_real_path (
196
- image_local_url , numerated_image_filename
197
- )
198
- img_num += 1
199
-
200
- image_filename = numerated_image_filename
174
+ image_local_url , real_image_path , image_filename = self ._fix_name_collision (
175
+ image_url , image_filename , image_content
176
+ )
201
177
202
178
self ._update_mapping (image_url , image_local_url , image_filename , replacement_mapping )
203
179
self ._write_image (real_image_path , image_content , image_link )
@@ -215,6 +191,24 @@ def stop(self):
215
191
logging .info ('Images downloading stopped.' )
216
192
self ._running = False
217
193
194
def _get_image_download_url(self, image_url: str, image_num: int) -> Optional[str]:
    """
    Work out where the image has to be fetched from.

    :param image_url: image reference as it was found in the document.
    :param image_num: zero-based image index; it is only used (as ``image_num + 1``) in log messages.
    :return: ``None`` if ``_need_to_skip_url`` says the image must be skipped,
        ``image_url`` unchanged if ``is_url`` accepts it,
        otherwise ``image_url`` appended to the article base URL or, when no base URL is set,
        joined to the directory of the article file.
    """
    shown_num = image_num + 1

    if self._need_to_skip_url(image_url):
        logging.debug('Image %d downloading was skipped...', shown_num)
        return None

    # Well-formed URLs are used as is.
    if is_url(image_url):
        return image_url

    logging.warning('Image %d ["%s"] probably has incorrect URL...', shown_num, image_url)

    path_maker = self._out_path_maker
    if not path_maker.article_base_url:
        # No base URL: treat the reference as a path relative to the article file.
        return str(Path(path_maker.article_file_path).parent / image_url)

    logging.debug('Trying to add base URL "%s"...', path_maker.article_base_url)
    return f'{path_maker.article_base_url}/{image_url}'
211
+
218
212
@staticmethod
219
213
def _resize_image (image_content : bytes , new_size , filename ):
220
214
img = Image .open (BytesIO (image_content ))
@@ -308,7 +302,6 @@ def _write_image(self, image_path: Path, data: bytes, image_link: Union[ImageLin
308
302
309
303
def _fix_paths (self , replacement_mapping , document_img_path , img_url , image_filename ):
310
304
"""Fix path if a file with the similar name exists already."""
311
-
312
305
# Images can have similar name, but different URLs, but I want to save original filename, if possible.
313
306
for url , path in replacement_mapping .items ():
314
307
if document_img_path == path and img_url != url :
@@ -317,3 +310,32 @@ def _fix_paths(self, replacement_mapping, document_img_path, img_url, image_file
317
310
break
318
311
319
312
return image_filename , document_img_path
313
+
314
def _fix_name_collision(self, image_url, image_filename, image_content):
    """
    Fix a possible collision between file names.

    The file already stored under ``image_filename`` is compared with ``image_content``.
    If the contents are the same, the existing name is kept. Otherwise a numeric prefix
    (``0``, ``1``, ...) is put before the file name until the resulting path does not exist.

    :param image_url: image URL, passed through to ``_get_real_path``.
    :param image_filename: file name the image is about to be saved under.
    :param image_content: downloaded image data (wrapped into ``BytesIO`` for the comparison).
    :return: tuple ``(image_local_url, real_image_path, image_filename)``, where the first two
        items come from ``_get_real_path`` and the last one is the possibly changed file name.
    :raises OSError: if the existing file can't be opened for reading.
    """
    image_local_url, real_image_path = self._get_real_path(image_url, image_filename)

    # The file is needed only for the comparison, so it is closed before any renaming starts.
    with open(real_image_path, 'rb') as real_file:
        same_content = is_binary_same(real_file, BytesIO(image_content))

    if same_content:
        # Not a collision: the very same image is saved already.
        # The local URL must be returned here, not the source one: caller unpacks it as `image_local_url`.
        return image_local_url, real_image_path, image_filename

    # Fix collision, changing name.
    # Initial value protects from an unbound name if the file disappears before the first check.
    numerated_image_filename = image_filename
    img_num: int = 0
    while real_image_path.exists():
        numerated_image_filename = f'{img_num}{image_filename}'
        real_image_path = self._out_path_maker.get_real_path(image_local_url, numerated_image_filename)
        img_num += 1

    image_filename = numerated_image_filename

    # Local URL and path are recalculated for the new name.
    image_local_url, real_image_path = self._get_real_path(image_url, image_filename)
    return image_local_url, real_image_path, image_filename
332
+
333
@staticmethod
def _get_hashed_image_name(image_filename, image_content) -> str:
    """
    Get filename from the image content.

    :param image_filename: current image file name; only its extension
        (as returned by ``split_file_ext``) is reused.
    :param image_content: image data to hash.
    :return: ``<SHA-256 hex digest of the content>.<extension>``.
    """
    _, image_ext = split_file_ext(image_filename)
    image_content_hash = hashlib.sha256(image_content).hexdigest()
    # Log the hash itself: the message promises the hash, not the old file name.
    logging.debug('Image content hash: %s', image_content_hash)
    return f'{image_content_hash}.{image_ext}'
0 commit comments