Skip to content

Commit d1b4b2b

Browse files
author
Artiom N.
committed
Build fix
1 parent d245977 commit d1b4b2b

File tree

3 files changed

+57
-67
lines changed

3 files changed

+57
-67
lines changed

markdown_toolset/image_downloader.py

+56-34
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def __repr__(self):
6161
class ImageDownloader:
6262
""" "Smart" images downloader."""
6363

64-
def __init__(
64+
def __init__( # pylint: disable=too-many-arguments
6565
self,
6666
out_path_maker: OutPathMaker,
6767
skip_list: Optional[List[str]] = None,
@@ -92,7 +92,7 @@ def __init__(
9292
self._running = False
9393
self._replace_image_names = replace_image_names
9494

95-
# pylint: disable=R0912(too-many-branches)
95+
# pylint: disable=R0912(too-many-branches),too-many-arguments
9696
def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
9797
"""
9898
Download and save images from the list.
@@ -101,7 +101,6 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
101101
"""
102102

103103
replacement_mapping: Dict[str, str] = {}
104-
105104
images_count = len(images)
106105

107106
# TODO: Refactor this.
@@ -116,20 +115,10 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
116115

117116
assert image_url not in replacement_mapping, f'BUG: already downloaded image "{image_url}"...'
118117

119-
if self._need_to_skip_url(image_url):
120-
logging.debug('Image %d downloading was skipped...', image_num + 1)
121-
continue
122-
123-
if not is_url(image_url):
124-
logging.warning('Image %d ["%s"] probably has incorrect URL...', image_num + 1, image_url)
118+
image_download_url = self._get_image_download_url(image_url, image_num)
125119

126-
if self._out_path_maker.article_base_url:
127-
logging.debug('Trying to add base URL "%s"...', self._out_path_maker.article_base_url)
128-
image_download_url = f'{self._out_path_maker.article_base_url}/{image_url}'
129-
else:
130-
image_download_url = str(Path(self._out_path_maker.article_file_path).parent / image_url)
131-
else:
132-
image_download_url = image_url
120+
if image_download_url is None:
121+
continue
133122

134123
try:
135124
mime_type, _ = mimetypes.guess_type(image_download_url)
@@ -158,10 +147,7 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
158147
continue
159148

160149
if self._replace_image_names:
161-
_, image_ext = split_file_ext(image_filename)
162-
image_content_hash = hashlib.sha384(image_content).hexdigest()
163-
logging.debug('Image content hash: %s', image_filename)
164-
image_filename = f'{image_content_hash}.{image_ext}'
150+
image_filename = self._get_hashed_image_name(image_filename, image_content)
165151

166152
except Exception as e:
167153
if self._skip_all_errors:
@@ -185,19 +171,9 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
185171
image_local_url, real_image_path = self._get_real_path(image_url, image_filename)
186172

187173
if self._replace_image_names and real_image_path.exists():
188-
# Image by this content hash exists, but possibly this is a collision.
189-
with open(real_image_path, 'rb') as real_file:
190-
if not is_binary_same(real_file, BytesIO(image_content)):
191-
# Fix collision, changing name.
192-
img_num: int = 0
193-
while real_image_path.exists():
194-
numerated_image_filename = f'{image_num}{image_filename}'
195-
real_image_path = self._out_path_maker.get_real_path(
196-
image_local_url, numerated_image_filename
197-
)
198-
img_num += 1
199-
200-
image_filename = numerated_image_filename
174+
image_local_url, real_image_path, image_filename = self._fix_name_collision(
175+
image_url, image_filename, image_content
176+
)
201177

202178
self._update_mapping(image_url, image_local_url, image_filename, replacement_mapping)
203179
self._write_image(real_image_path, image_content, image_link)
@@ -215,6 +191,24 @@ def stop(self):
215191
logging.info('Images downloading stopped.')
216192
self._running = False
217193

194+
def _get_image_download_url(self, image_url: str, image_num: int) -> Optional[str]:
    """
    Work out where the image has to be fetched from.

    :param image_url: image reference to resolve.
    :param image_num: image index, used only in log messages (reported as ``image_num + 1``).
    :return: ``None`` when the image must be skipped, otherwise the URL or path to download from.
    """
    if self._need_to_skip_url(image_url):
        logging.debug('Image %d downloading was skipped...', image_num + 1)
        return None

    if is_url(image_url):
        # A well-formed URL is downloaded as is.
        return image_url

    logging.warning('Image %d ["%s"] probably has incorrect URL...', image_num + 1, image_url)

    base_url = self._out_path_maker.article_base_url
    if not base_url:
        # No base URL is known: resolve the reference against the article file directory.
        article_dir = Path(self._out_path_maker.article_file_path).parent
        return str(article_dir / image_url)

    logging.debug('Trying to add base URL "%s"...', base_url)
    return f'{base_url}/{image_url}'
211+
218212
@staticmethod
219213
def _resize_image(image_content: bytes, new_size, filename):
220214
img = Image.open(BytesIO(image_content))
@@ -308,7 +302,6 @@ def _write_image(self, image_path: Path, data: bytes, image_link: Union[ImageLin
308302

309303
def _fix_paths(self, replacement_mapping, document_img_path, img_url, image_filename):
310304
"""Fix path if a file with the similar name exists already."""
311-
312305
# Images can have similar name, but different URLs, but I want to save original filename, if possible.
313306
for url, path in replacement_mapping.items():
314307
if document_img_path == path and img_url != url:
@@ -317,3 +310,32 @@ def _fix_paths(self, replacement_mapping, document_img_path, img_url, image_file
317310
break
318311

319312
return image_filename, document_img_path
313+
314+
def _fix_name_collision(self, image_url, image_filename, image_content):
    """
    Resolve a possible collision between image file names.

    If a file with the target name already exists and its content differs from
    ``image_content``, a numeric prefix (0, 1, 2, ...) is added to the file name
    until an unused path is found.

    :param image_url: image URL, passed through to ``self._get_real_path()``.
    :param image_filename: desired image file name.
    :param image_content: downloaded image bytes, compared with the existing file.
    :return: tuple ``(image_local_url, real_image_path, image_filename)``.
    """
    image_local_url, real_image_path = self._get_real_path(image_url, image_filename)

    if not real_image_path.exists():
        # Nothing is stored under this name yet, so there is nothing to collide with.
        return image_local_url, real_image_path, image_filename

    # Compare inside the `with` so the existing file is closed before new names are probed.
    with open(real_image_path, 'rb') as real_file:
        is_same_content = is_binary_same(real_file, BytesIO(image_content))

    if is_same_content:
        # The same image is already stored: keep the name and return the *local* URL.
        # Callers unpack the first element as `image_local_url`, so returning the
        # source `image_url` here would corrupt the replacement mapping.
        return image_local_url, real_image_path, image_filename

    # Real collision (same name, different content): look for a free numerated name.
    img_num: int = 0
    while real_image_path.exists():
        numerated_image_filename = f'{img_num}{image_filename}'
        real_image_path = self._out_path_maker.get_real_path(image_local_url, numerated_image_filename)
        img_num += 1

    # Recalculate the local URL and the real path for the final file name.
    # The tuple is parenthesised because a bare starred `return` needs Python 3.8+.
    return (*self._get_real_path(image_url, numerated_image_filename), numerated_image_filename)
332+
333+
@staticmethod
def _get_hashed_image_name(image_filename: str, image_content: bytes) -> str:
    """
    Build a file name from the image content.

    :param image_filename: original image file name; only its extension is kept.
    :param image_content: image bytes to hash.
    :return: name in the form ``<sha256 hex digest>.<extension>``.
    """
    _, image_ext = split_file_ext(image_filename)
    image_content_hash = hashlib.sha256(image_content).hexdigest()
    # Log the digest itself: the message announces a hash, not the original file name.
    logging.debug('Image content hash: %s', image_content_hash)
    return f'{image_content_hash}.{image_ext}'

tests/test_article_downloader.py

-32
This file was deleted.

tests/test_image_downloader.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def test_names_replacing(self):
8282
)
8383

8484
with open(self._article_images_path / self._image_filename, 'rb') as image_file:
85-
image_hash = hashlib.sha384(image_file.read()).hexdigest()
85+
image_hash = hashlib.sha256(image_file.read()).hexdigest()
8686

8787
image_downloader.download_images([self._image_in_relpath])
8888

0 commit comments

Comments
 (0)