# cache.py 9.4 KB
  1. """Cache Management
  2. """
  3. import hashlib
  4. import json
  5. import logging
  6. import os
  7. from pathlib import Path
  8. from typing import Any, Dict, List, Optional
  9. from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
  10. from pip._vendor.packaging.utils import canonicalize_name
  11. from pip._internal.exceptions import InvalidWheelFilename
  12. from pip._internal.models.direct_url import DirectUrl
  13. from pip._internal.models.link import Link
  14. from pip._internal.models.wheel import Wheel
  15. from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
  16. from pip._internal.utils.urls import path_to_url
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Name of the JSON file that stores a cache entry's download origin.
# CacheEntry looks for it in the directory containing the cached file, and
# WheelCache.record_download_origin writes it into the given cache directory.
ORIGIN_JSON_NAME = "origin.json"
  19. def _hash_dict(d: Dict[str, str]) -> str:
  20. """Return a stable sha224 of a dictionary."""
  21. s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
  22. return hashlib.sha224(s.encode("ascii")).hexdigest()
  23. class Cache:
  24. """An abstract class - provides cache directories for data from links
  25. :param cache_dir: The root of the cache.
  26. """
  27. def __init__(self, cache_dir: str) -> None:
  28. super().__init__()
  29. assert not cache_dir or os.path.isabs(cache_dir)
  30. self.cache_dir = cache_dir or None
  31. def _get_cache_path_parts(self, link: Link) -> List[str]:
  32. """Get parts of part that must be os.path.joined with cache_dir"""
  33. # We want to generate an url to use as our cache key, we don't want to
  34. # just re-use the URL because it might have other items in the fragment
  35. # and we don't care about those.
  36. key_parts = {"url": link.url_without_fragment}
  37. if link.hash_name is not None and link.hash is not None:
  38. key_parts[link.hash_name] = link.hash
  39. if link.subdirectory_fragment:
  40. key_parts["subdirectory"] = link.subdirectory_fragment
  41. # Include interpreter name, major and minor version in cache key
  42. # to cope with ill-behaved sdists that build a different wheel
  43. # depending on the python version their setup.py is being run on,
  44. # and don't encode the difference in compatibility tags.
  45. # https://github.com/pypa/pip/issues/7296
  46. key_parts["interpreter_name"] = interpreter_name()
  47. key_parts["interpreter_version"] = interpreter_version()
  48. # Encode our key url with sha224, we'll use this because it has similar
  49. # security properties to sha256, but with a shorter total output (and
  50. # thus less secure). However the differences don't make a lot of
  51. # difference for our use case here.
  52. hashed = _hash_dict(key_parts)
  53. # We want to nest the directories some to prevent having a ton of top
  54. # level directories where we might run out of sub directories on some
  55. # FS.
  56. parts = [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
  57. return parts
  58. def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
  59. can_not_cache = not self.cache_dir or not canonical_package_name or not link
  60. if can_not_cache:
  61. return []
  62. candidates = []
  63. path = self.get_path_for_link(link)
  64. if os.path.isdir(path):
  65. for candidate in os.listdir(path):
  66. candidates.append((candidate, path))
  67. return candidates
  68. def get_path_for_link(self, link: Link) -> str:
  69. """Return a directory to store cached items in for link."""
  70. raise NotImplementedError()
  71. def get(
  72. self,
  73. link: Link,
  74. package_name: Optional[str],
  75. supported_tags: List[Tag],
  76. ) -> Link:
  77. """Returns a link to a cached item if it exists, otherwise returns the
  78. passed link.
  79. """
  80. raise NotImplementedError()
  81. class SimpleWheelCache(Cache):
  82. """A cache of wheels for future installs."""
  83. def __init__(self, cache_dir: str) -> None:
  84. super().__init__(cache_dir)
  85. def get_path_for_link(self, link: Link) -> str:
  86. """Return a directory to store cached wheels for link
  87. Because there are M wheels for any one sdist, we provide a directory
  88. to cache them in, and then consult that directory when looking up
  89. cache hits.
  90. We only insert things into the cache if they have plausible version
  91. numbers, so that we don't contaminate the cache with things that were
  92. not unique. E.g. ./package might have dozens of installs done for it
  93. and build a version of 0.0...and if we built and cached a wheel, we'd
  94. end up using the same wheel even if the source has been edited.
  95. :param link: The link of the sdist for which this will cache wheels.
  96. """
  97. parts = self._get_cache_path_parts(link)
  98. assert self.cache_dir
  99. # Store wheels within the root cache_dir
  100. return os.path.join(self.cache_dir, "wheels", *parts)
  101. def get(
  102. self,
  103. link: Link,
  104. package_name: Optional[str],
  105. supported_tags: List[Tag],
  106. ) -> Link:
  107. candidates = []
  108. if not package_name:
  109. return link
  110. canonical_package_name = canonicalize_name(package_name)
  111. for wheel_name, wheel_dir in self._get_candidates(link, canonical_package_name):
  112. try:
  113. wheel = Wheel(wheel_name)
  114. except InvalidWheelFilename:
  115. continue
  116. if canonicalize_name(wheel.name) != canonical_package_name:
  117. logger.debug(
  118. "Ignoring cached wheel %s for %s as it "
  119. "does not match the expected distribution name %s.",
  120. wheel_name,
  121. link,
  122. package_name,
  123. )
  124. continue
  125. if not wheel.supported(supported_tags):
  126. # Built for a different python/arch/etc
  127. continue
  128. candidates.append(
  129. (
  130. wheel.support_index_min(supported_tags),
  131. wheel_name,
  132. wheel_dir,
  133. )
  134. )
  135. if not candidates:
  136. return link
  137. _, wheel_name, wheel_dir = min(candidates)
  138. return Link(path_to_url(os.path.join(wheel_dir, wheel_name)))
  139. class EphemWheelCache(SimpleWheelCache):
  140. """A SimpleWheelCache that creates it's own temporary cache directory"""
  141. def __init__(self) -> None:
  142. self._temp_dir = TempDirectory(
  143. kind=tempdir_kinds.EPHEM_WHEEL_CACHE,
  144. globally_managed=True,
  145. )
  146. super().__init__(self._temp_dir.path)
  147. class CacheEntry:
  148. def __init__(
  149. self,
  150. link: Link,
  151. persistent: bool,
  152. ):
  153. self.link = link
  154. self.persistent = persistent
  155. self.origin: Optional[DirectUrl] = None
  156. origin_direct_url_path = Path(self.link.file_path).parent / ORIGIN_JSON_NAME
  157. if origin_direct_url_path.exists():
  158. self.origin = DirectUrl.from_json(origin_direct_url_path.read_text())
  159. class WheelCache(Cache):
  160. """Wraps EphemWheelCache and SimpleWheelCache into a single Cache
  161. This Cache allows for gracefully degradation, using the ephem wheel cache
  162. when a certain link is not found in the simple wheel cache first.
  163. """
  164. def __init__(self, cache_dir: str) -> None:
  165. super().__init__(cache_dir)
  166. self._wheel_cache = SimpleWheelCache(cache_dir)
  167. self._ephem_cache = EphemWheelCache()
  168. def get_path_for_link(self, link: Link) -> str:
  169. return self._wheel_cache.get_path_for_link(link)
  170. def get_ephem_path_for_link(self, link: Link) -> str:
  171. return self._ephem_cache.get_path_for_link(link)
  172. def get(
  173. self,
  174. link: Link,
  175. package_name: Optional[str],
  176. supported_tags: List[Tag],
  177. ) -> Link:
  178. cache_entry = self.get_cache_entry(link, package_name, supported_tags)
  179. if cache_entry is None:
  180. return link
  181. return cache_entry.link
  182. def get_cache_entry(
  183. self,
  184. link: Link,
  185. package_name: Optional[str],
  186. supported_tags: List[Tag],
  187. ) -> Optional[CacheEntry]:
  188. """Returns a CacheEntry with a link to a cached item if it exists or
  189. None. The cache entry indicates if the item was found in the persistent
  190. or ephemeral cache.
  191. """
  192. retval = self._wheel_cache.get(
  193. link=link,
  194. package_name=package_name,
  195. supported_tags=supported_tags,
  196. )
  197. if retval is not link:
  198. return CacheEntry(retval, persistent=True)
  199. retval = self._ephem_cache.get(
  200. link=link,
  201. package_name=package_name,
  202. supported_tags=supported_tags,
  203. )
  204. if retval is not link:
  205. return CacheEntry(retval, persistent=False)
  206. return None
  207. @staticmethod
  208. def record_download_origin(cache_dir: str, download_info: DirectUrl) -> None:
  209. origin_path = Path(cache_dir) / ORIGIN_JSON_NAME
  210. if origin_path.is_file():
  211. origin = DirectUrl.from_json(origin_path.read_text())
  212. # TODO: use DirectUrl.equivalent when https://github.com/pypa/pip/pull/10564
  213. # is merged.
  214. if origin.url != download_info.url:
  215. logger.warning(
  216. "Origin URL %s in cache entry %s does not match download URL %s. "
  217. "This is likely a pip bug or a cache corruption issue.",
  218. origin.url,
  219. cache_dir,
  220. download_info.url,
  221. )
  222. origin_path.write_text(download_info.to_json(), encoding="utf-8")