download.py

  1. """Download files with progress indicators.
  2. """
  3. import email.message
  4. import logging
  5. import mimetypes
  6. import os
  7. from typing import Iterable, Optional, Tuple
  8. from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response
  9. from pip._internal.cli.progress_bars import get_download_progress_renderer
  10. from pip._internal.exceptions import NetworkConnectionError
  11. from pip._internal.models.index import PyPI
  12. from pip._internal.models.link import Link
  13. from pip._internal.network.cache import is_from_cache
  14. from pip._internal.network.session import PipSession
  15. from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
  16. from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext
  17. logger = logging.getLogger(__name__)


def _get_http_response_size(resp: Response) -> Optional[int]:
    try:
        return int(resp.headers["content-length"])
    except (ValueError, KeyError, TypeError):
        return None


def _prepare_download(
    resp: Response,
    link: Link,
    progress_bar: str,
) -> Iterable[bytes]:
    total_length = _get_http_response_size(resp)

    if link.netloc == PyPI.file_storage_domain:
        url = link.show_url
    else:
        url = link.url_without_fragment

    logged_url = redact_auth_from_url(url)

    if total_length:
        logged_url = "{} ({})".format(logged_url, format_size(total_length))

    if is_from_cache(resp):
        logger.info("Using cached %s", logged_url)
    else:
        logger.info("Downloading %s", logged_url)

    # Show a progress bar only when it is useful: never when logging is
    # quieter than INFO or when the response came from the local cache, and
    # otherwise only when the size is unknown or larger than 40 kB.
    if logger.getEffectiveLevel() > logging.INFO:
        show_progress = False
    elif is_from_cache(resp):
        show_progress = False
    elif not total_length:
        show_progress = True
    elif total_length > (40 * 1000):
        show_progress = True
    else:
        show_progress = False

    chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)

    if not show_progress:
        return chunks

    renderer = get_download_progress_renderer(bar_type=progress_bar, size=total_length)
    return renderer(chunks)


def sanitize_content_filename(filename: str) -> str:
    """
    Sanitize the "filename" value from a Content-Disposition header.
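
    Illustrative example (the path value here is arbitrary): any directory
    components are dropped, so a crafted name cannot escape the download
    directory.

    >>> sanitize_content_filename("../../evil.whl")
    'evil.whl'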
  58. """
  59. return os.path.basename(filename)


def parse_content_disposition(content_disposition: str, default_filename: str) -> str:
    """
    Parse the "filename" value from a Content-Disposition header, and
    return the default filename if the result is empty.
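
    Illustrative example (header value and filenames are made up): a quoted
    filename parameter is extracted and unquoted, and the default is used
    when the parameter is missing.

    >>> parse_content_disposition('attachment; filename="pkg-1.0.tar.gz"', "dflt")
    'pkg-1.0.tar.gz'
    >>> parse_content_disposition("attachment", "dflt")
    'dflt'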
  64. """
  65. m = email.message.Message()
  66. m["content-type"] = content_disposition
  67. filename = m.get_param("filename")
  68. if filename:
  69. # We need to sanitize the filename to prevent directory traversal
  70. # in case the filename contains ".." path parts.
  71. filename = sanitize_content_filename(str(filename))
  72. return filename or default_filename


def _get_http_response_filename(resp: Response, link: Link) -> str:
    """Get an ideal filename from the given HTTP response, falling back to
    the link filename if not provided.
    """
    filename = link.filename  # fallback
    # Have a look at the Content-Disposition header for a better guess
    content_disposition = resp.headers.get("content-disposition")
    if content_disposition:
        filename = parse_content_disposition(content_disposition, filename)
    ext: Optional[str] = splitext(filename)[1]
    if not ext:
        ext = mimetypes.guess_extension(resp.headers.get("content-type", ""))
        if ext:
            filename += ext
    if not ext and link.url != resp.url:
        ext = os.path.splitext(resp.url)[1]
        if ext:
            filename += ext
    return filename


def _http_get_download(session: PipSession, link: Link) -> Response:
    # Drop any URL fragment (e.g. a hash) before issuing the request.
    target_url = link.url.split("#", 1)[0]
    resp = session.get(target_url, headers=HEADERS, stream=True)
    raise_for_status(resp)
    return resp


class Downloader:
    def __init__(
        self,
        session: PipSession,
        progress_bar: str,
    ) -> None:
        self._session = session
        self._progress_bar = progress_bar

    def __call__(self, link: Link, location: str) -> Tuple[str, str]:
        """Download the file given by link into location."""
        try:
            resp = _http_get_download(self._session, link)
        except NetworkConnectionError as e:
            assert e.response is not None
            logger.critical(
                "HTTP error %s while getting %s", e.response.status_code, link
            )
            raise

        filename = _get_http_response_filename(resp, link)
        filepath = os.path.join(location, filename)

        chunks = _prepare_download(resp, link, self._progress_bar)
        with open(filepath, "wb") as content_file:
            for chunk in chunks:
                content_file.write(chunk)
        content_type = resp.headers.get("Content-Type", "")
        return filepath, content_type


class BatchDownloader:
    def __init__(
        self,
        session: PipSession,
        progress_bar: str,
    ) -> None:
        self._session = session
        self._progress_bar = progress_bar

    def __call__(
        self, links: Iterable[Link], location: str
    ) -> Iterable[Tuple[Link, Tuple[str, str]]]:
        """Download the files given by links into location."""
        for link in links:
            try:
                resp = _http_get_download(self._session, link)
            except NetworkConnectionError as e:
                assert e.response is not None
                logger.critical(
                    "HTTP error %s while getting %s",
                    e.response.status_code,
                    link,
                )
                raise

            filename = _get_http_response_filename(resp, link)
            filepath = os.path.join(location, filename)

            chunks = _prepare_download(resp, link, self._progress_bar)
            with open(filepath, "wb") as content_file:
                for chunk in chunks:
                    content_file.write(chunk)
            content_type = resp.headers.get("Content-Type", "")
            yield link, (filepath, content_type)
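

if __name__ == "__main__":
    # Usage sketch only: Downloader and BatchDownloader are pip-internal
    # helpers, and the URL below is a placeholder rather than a real
    # distribution. Running this requires network access and a pip
    # installation whose internals match the imports above.
    example_link = Link("https://example.com/packages/pkg-1.0.tar.gz")
    session = PipSession()
    download = Downloader(session, progress_bar="on")
    filepath, content_type = download(example_link, location=".")
    print(f"Saved {filepath} ({content_type})")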