Diffstat:
 src/cargo/sources/registry/download.rs    | 130
 src/cargo/sources/registry/http_remote.rs | 802
 src/cargo/sources/registry/index.rs       | 889
 src/cargo/sources/registry/local.rs       | 149
 src/cargo/sources/registry/mod.rs         | 970
 src/cargo/sources/registry/remote.rs      | 358
 6 files changed, 3298 insertions(+), 0 deletions(-)
diff --git a/src/cargo/sources/registry/download.rs b/src/cargo/sources/registry/download.rs
new file mode 100644
index 0000000..723c55f
--- /dev/null
+++ b/src/cargo/sources/registry/download.rs
@@ -0,0 +1,130 @@
+use anyhow::Context;
+use cargo_util::Sha256;
+
+use crate::core::PackageId;
+use crate::sources::registry::make_dep_prefix;
+use crate::sources::registry::MaybeLock;
+use crate::sources::registry::{
+ RegistryConfig, CHECKSUM_TEMPLATE, CRATE_TEMPLATE, LOWER_PREFIX_TEMPLATE, PREFIX_TEMPLATE,
+ VERSION_TEMPLATE,
+};
+use crate::util::auth;
+use crate::util::errors::CargoResult;
+use crate::util::{Config, Filesystem};
+use std::fmt::Write as FmtWrite;
+use std::fs::{self, File, OpenOptions};
+use std::io::prelude::*;
+use std::io::SeekFrom;
+use std::str;
+
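+/// File name of the cached `.crate` tarball for `pkg`, e.g. `serde-1.0.0.crate`.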
+pub(super) fn filename(pkg: PackageId) -> String {
+ format!("{}-{}.crate", pkg.name(), pkg.version())
+}
+
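+/// Looks up the `.crate` file for `pkg` in the download cache, returning
+/// `MaybeLock::Ready` if it is already present and otherwise a
+/// `MaybeLock::Download` describing the URL the caller must fetch.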
+pub(super) fn download(
+ cache_path: &Filesystem,
+ config: &Config,
+ pkg: PackageId,
+ checksum: &str,
+ registry_config: RegistryConfig,
+) -> CargoResult<MaybeLock> {
+ let filename = filename(pkg);
+ let path = cache_path.join(&filename);
+ let path = config.assert_package_cache_locked(&path);
+
+ // Attempt to open a read-only copy first to avoid an exclusive write
+ // lock and also work with read-only filesystems. Note that we check the
+ // length of the file below to handle interrupted downloads: a
+ // zero-length file is treated as missing.
+ //
+ // If this fails then we fall through to the exclusive path where we may
+ // have to redownload the file.
+ if let Ok(dst) = File::open(path) {
+ let meta = dst.metadata()?;
+ if meta.len() > 0 {
+ return Ok(MaybeLock::Ready(dst));
+ }
+ }
+
+ let mut url = registry_config.dl;
+ if !url.contains(CRATE_TEMPLATE)
+ && !url.contains(VERSION_TEMPLATE)
+ && !url.contains(PREFIX_TEMPLATE)
+ && !url.contains(LOWER_PREFIX_TEMPLATE)
+ && !url.contains(CHECKSUM_TEMPLATE)
+ {
+ // The original URL format, from before customization of the download URL was supported.
+ write!(url, "/{}/{}/download", pkg.name(), pkg.version()).unwrap();
+ } else {
+ let prefix = make_dep_prefix(&*pkg.name());
+ url = url
+ .replace(CRATE_TEMPLATE, &*pkg.name())
+ .replace(VERSION_TEMPLATE, &pkg.version().to_string())
+ .replace(PREFIX_TEMPLATE, &prefix)
+ .replace(LOWER_PREFIX_TEMPLATE, &prefix.to_lowercase())
+ .replace(CHECKSUM_TEMPLATE, checksum);
+ }
+
+ let authorization = if registry_config.auth_required {
+ Some(auth::auth_token(config, &pkg.source_id(), None, None)?)
+ } else {
+ None
+ };
+
+ Ok(MaybeLock::Download {
+ url,
+ descriptor: pkg.to_string(),
+ authorization,
+ })
+}
+
+pub(super) fn finish_download(
+ cache_path: &Filesystem,
+ config: &Config,
+ pkg: PackageId,
+ checksum: &str,
+ data: &[u8],
+) -> CargoResult<File> {
+ // Verify what we just downloaded
+ let actual = Sha256::new().update(data).finish_hex();
+ if actual != checksum {
+ anyhow::bail!("failed to verify the checksum of `{}`", pkg)
+ }
+
+ let filename = filename(pkg);
+ cache_path.create_dir()?;
+ let path = cache_path.join(&filename);
+ let path = config.assert_package_cache_locked(&path);
+ let mut dst = OpenOptions::new()
+ .create(true)
+ .read(true)
+ .write(true)
+ .open(&path)
+ .with_context(|| format!("failed to open `{}`", path.display()))?;
+ let meta = dst.metadata()?;
+ if meta.len() > 0 {
+ return Ok(dst);
+ }
+
+ dst.write_all(data)?;
+ dst.seek(SeekFrom::Start(0))?;
+ Ok(dst)
+}
+
+pub(super) fn is_crate_downloaded(
+ cache_path: &Filesystem,
+ config: &Config,
+ pkg: PackageId,
+) -> bool {
+ let path = cache_path.join(filename(pkg));
+ let path = config.assert_package_cache_locked(&path);
+ if let Ok(meta) = fs::metadata(path) {
+ return meta.len() > 0;
+ }
+ false
+}
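For orientation, the URL expansion performed by `download` above can be exercised in isolation. Below is a minimal sketch; the marker strings and `make_dep_prefix` are simplified stand-ins for the real `*_TEMPLATE` constants and helper in `sources/registry/mod.rs`, reproduced here only to illustrate the substitution:

    // Sketch of the {crate}/{version}/{prefix} substitution done by `download`.
    fn make_dep_prefix(name: &str) -> String {
        match name.len() {
            1 => String::from("1"),
            2 => String::from("2"),
            3 => format!("3/{}", &name[..1]),
            _ => format!("{}/{}", &name[..2], &name[2..4]),
        }
    }

    fn expand_dl_url(template: &str, name: &str, version: &str, checksum: &str) -> String {
        let prefix = make_dep_prefix(name);
        template
            .replace("{crate}", name)
            .replace("{version}", version)
            .replace("{prefix}", &prefix)
            .replace("{lowerprefix}", &prefix.to_lowercase())
            .replace("{sha256-checksum}", checksum)
    }

    fn main() {
        let url = expand_dl_url(
            "https://dl.example.com/{prefix}/{crate}/{crate}-{version}.crate",
            "regex",
            "1.7.0",
            "deadbeef",
        );
        // `make_dep_prefix("regex")` is "re/ge", so this prints:
        // https://dl.example.com/re/ge/regex/regex-1.7.0.crate
        println!("{url}");
    }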
diff --git a/src/cargo/sources/registry/http_remote.rs b/src/cargo/sources/registry/http_remote.rs
new file mode 100644
index 0000000..c3bcadf
--- /dev/null
+++ b/src/cargo/sources/registry/http_remote.rs
@@ -0,0 +1,802 @@
+//! Access to an HTTP-based crate registry.
+//!
+//! See [`HttpRegistry`] for details.
+
+use crate::core::{PackageId, SourceId};
+use crate::ops::{self};
+use crate::sources::registry::download;
+use crate::sources::registry::MaybeLock;
+use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData};
+use crate::util::errors::{CargoResult, HttpNotSuccessful};
+use crate::util::network::Retry;
+use crate::util::{auth, Config, Filesystem, IntoUrl, Progress, ProgressStyle};
+use anyhow::Context;
+use cargo_util::paths;
+use curl::easy::{HttpVersion, List};
+use curl::multi::{EasyHandle, Multi};
+use log::{debug, trace, warn};
+use std::cell::RefCell;
+use std::collections::{HashMap, HashSet};
+use std::fs::{self, File};
+use std::io::ErrorKind;
+use std::path::{Path, PathBuf};
+use std::str;
+use std::task::{ready, Poll};
+use std::time::Duration;
+use url::Url;
+
+// HTTP headers
+const ETAG: &str = "etag";
+const LAST_MODIFIED: &str = "last-modified";
+const WWW_AUTHENTICATE: &str = "www-authenticate";
+const IF_NONE_MATCH: &str = "if-none-match";
+const IF_MODIFIED_SINCE: &str = "if-modified-since";
+
+const UNKNOWN: &str = "Unknown";
+
+/// A registry served by the HTTP-based registry API.
+///
+/// This type is primarily accessed through the [`RegistryData`] trait.
+///
+/// `HttpRegistry` implements the HTTP-based registry API outlined in [RFC 2789]. Read the RFC for
+/// the complete protocol, but _roughly_ the implementation loads each index file (e.g.,
+/// config.json or re/ge/regex) from an HTTP service rather than from a locally cloned git
+/// repository. The remote service can more or less be a static file server that simply serves the
+/// contents of the origin git repository.
+///
+/// Implemented naively, this leads to a significant amount of network traffic, as a lookup of any
+/// index file would need to check with the remote backend if the index file has changed. This
+/// cost is somewhat mitigated by the use of HTTP conditional fetches (`If-Modified-Since` and
+/// `If-None-Match` for `ETag`s) which can be efficiently handled by HTTP/2.
+///
+/// [RFC 2789]: https://github.com/rust-lang/rfcs/pull/2789
+pub struct HttpRegistry<'cfg> {
+ index_path: Filesystem,
+ cache_path: Filesystem,
+ source_id: SourceId,
+ config: &'cfg Config,
+
+ /// The server URL, with the `sparse+` protocol prefix stripped.
+ url: Url,
+
+ /// HTTP multi-handle for asynchronous/parallel requests.
+ multi: Multi,
+
+ /// Has the client requested a cache update?
+ ///
+ /// Only if they have do we double-check the freshness of each locally-stored index file.
+ requested_update: bool,
+
+ /// State for currently pending index downloads.
+ downloads: Downloads<'cfg>,
+
+ /// Does the config say that we can use HTTP multiplexing?
+ multiplexing: bool,
+
+ /// What paths have we already fetched since the last index update?
+ ///
+ /// We do not need to double-check any of these index files since we have already done so.
+ fresh: HashSet<PathBuf>,
+
+ /// Have we started to download any index files?
+ fetch_started: bool,
+
+ /// Cached registry configuration.
+ registry_config: Option<RegistryConfig>,
+
+ /// Should we include the authorization header?
+ auth_required: bool,
+
+ /// URL from which to get a token for the registry.
+ login_url: Option<Url>,
+}
+
+/// Helper for downloading crates.
+pub struct Downloads<'cfg> {
+ /// When a download is started, it is added to this map. The key is a
+ /// "token" (see `Download::token`). It is removed once the download is
+ /// finished.
+ pending: HashMap<usize, (Download<'cfg>, EasyHandle)>,
+ /// Set of paths currently being downloaded.
+ /// This should stay in sync with `pending`.
+ pending_paths: HashSet<PathBuf>,
+ /// The final result of each download.
+ results: HashMap<PathBuf, CargoResult<CompletedDownload>>,
+ /// The next ID to use for creating a token (see `Download::token`).
+ next: usize,
+ /// Progress bar.
+ progress: RefCell<Option<Progress<'cfg>>>,
+ /// Number of downloads that have successfully finished.
+ downloads_finished: usize,
+ /// Number of times the caller has requested blocking. This is used for
+ /// an estimate of progress.
+ blocking_calls: usize,
+}
+
+struct Download<'cfg> {
+ /// The token for this download, used as the key of the `Downloads::pending` map
+ /// and stored in `EasyHandle` as well.
+ token: usize,
+
+ /// The path of the package that we're downloading.
+ path: PathBuf,
+
+ /// Actual downloaded data, updated throughout the lifetime of this download.
+ data: RefCell<Vec<u8>>,
+
+ /// HTTP headers.
+ header_map: RefCell<Headers>,
+
+ /// Logic used to track retrying this download if it's a spurious failure.
+ retry: Retry<'cfg>,
+}
+
+#[derive(Default)]
+struct Headers {
+ last_modified: Option<String>,
+ etag: Option<String>,
+ www_authenticate: Vec<String>,
+}
+
+enum StatusCode {
+ Success,
+ NotModified,
+ NotFound,
+ Unauthorized,
+}
+
+struct CompletedDownload {
+ response_code: StatusCode,
+ data: Vec<u8>,
+ header_map: Headers,
+}
+
+impl<'cfg> HttpRegistry<'cfg> {
+ pub fn new(
+ source_id: SourceId,
+ config: &'cfg Config,
+ name: &str,
+ ) -> CargoResult<HttpRegistry<'cfg>> {
+ let url = source_id.url().as_str();
+ // Ensure the url ends with a slash so we can concatenate paths.
+ if !url.ends_with('/') {
+ anyhow::bail!("sparse registry url must end in a slash `/`: {url}")
+ }
+ assert!(source_id.is_sparse());
+ let url = url
+ .strip_prefix("sparse+")
+ .expect("sparse registry needs sparse+ prefix")
+ .into_url()
+ .expect("a url with the sparse+ stripped should still be valid");
+
+ Ok(HttpRegistry {
+ index_path: config.registry_index_path().join(name),
+ cache_path: config.registry_cache_path().join(name),
+ source_id,
+ config,
+ url,
+ multi: Multi::new(),
+ multiplexing: false,
+ downloads: Downloads {
+ next: 0,
+ pending: HashMap::new(),
+ pending_paths: HashSet::new(),
+ results: HashMap::new(),
+ progress: RefCell::new(Some(Progress::with_style(
+ "Fetch",
+ ProgressStyle::Indeterminate,
+ config,
+ ))),
+ downloads_finished: 0,
+ blocking_calls: 0,
+ },
+ fresh: HashSet::new(),
+ requested_update: false,
+ fetch_started: false,
+ registry_config: None,
+ auth_required: false,
+ login_url: None,
+ })
+ }
+
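+ /// Splits a raw HTTP header line into a `(name, value)` pair, so e.g.
+ /// `b"ETag: \"xyzzy\"\r\n"` becomes `("ETag", "\"xyzzy\"")`. Returns `None`
+ /// for blank or malformed lines; the caller lowercases the name before
+ /// matching on it.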
+ fn handle_http_header(buf: &[u8]) -> Option<(&str, &str)> {
+ if buf.is_empty() {
+ return None;
+ }
+ let buf = std::str::from_utf8(buf).ok()?.trim_end();
+ // Don't let server sneak extra lines anywhere.
+ if buf.contains('\n') {
+ return None;
+ }
+ let (tag, value) = buf.split_once(':')?;
+ let value = value.trim();
+ Some((tag, value))
+ }
+
+ fn start_fetch(&mut self) -> CargoResult<()> {
+ if self.fetch_started {
+ // We only need to run the setup code once.
+ return Ok(());
+ }
+ self.fetch_started = true;
+
+ // We've enabled the `http2` feature of `curl` in Cargo, so treat
+ // failures here as fatal as they would indicate a build-time problem.
+ self.multiplexing = self.config.http_config()?.multiplexing.unwrap_or(true);
+
+ self.multi
+ .pipelining(false, self.multiplexing)
+ .with_context(|| "failed to enable multiplexing/pipelining in curl")?;
+
+ // let's not flood the server with connections
+ self.multi.set_max_host_connections(2)?;
+
+ self.config
+ .shell()
+ .status("Updating", self.source_id.display_index())?;
+
+ Ok(())
+ }
+
+ fn handle_completed_downloads(&mut self) -> CargoResult<()> {
+ assert_eq!(
+ self.downloads.pending.len(),
+ self.downloads.pending_paths.len()
+ );
+
+ // Collect the results from the Multi handle.
+ let results = {
+ let mut results = Vec::new();
+ let pending = &mut self.downloads.pending;
+ self.multi.messages(|msg| {
+ let token = msg.token().expect("failed to read token");
+ let (_, handle) = &pending[&token];
+ if let Some(result) = msg.result_for(handle) {
+ results.push((token, result));
+ };
+ });
+ results
+ };
+ for (token, result) in results {
+ let (mut download, handle) = self.downloads.pending.remove(&token).unwrap();
+ let mut handle = self.multi.remove(handle)?;
+ let data = download.data.take();
+ let url = self.full_url(&download.path);
+ let result = match download.retry.r#try(|| {
+ result.with_context(|| format!("failed to download from `{}`", url))?;
+ let code = handle.response_code()?;
+ // Keep this list of expected status codes in sync with the codes handled in `load`
+ let code = match code {
+ 200 => StatusCode::Success,
+ 304 => StatusCode::NotModified,
+ 401 => StatusCode::Unauthorized,
+ 404 | 410 | 451 => StatusCode::NotFound,
+ code => {
+ let url = handle.effective_url()?.unwrap_or(&url);
+ return Err(HttpNotSuccessful {
+ code,
+ url: url.to_owned(),
+ body: data,
+ }
+ .into());
+ }
+ };
+ Ok((data, code))
+ }) {
+ Ok(Some((data, code))) => Ok(CompletedDownload {
+ response_code: code,
+ data,
+ header_map: download.header_map.take(),
+ }),
+ Ok(None) => {
+ // retry the operation
+ let handle = self.multi.add(handle)?;
+ self.downloads.pending.insert(token, (download, handle));
+ continue;
+ }
+ Err(e) => Err(e),
+ };
+
+ assert!(self.downloads.pending_paths.remove(&download.path));
+ self.downloads.results.insert(download.path, result);
+ self.downloads.downloads_finished += 1;
+ }
+
+ self.downloads.tick()?;
+
+ Ok(())
+ }
+
+ fn full_url(&self, path: &Path) -> String {
+ // self.url always ends with a slash.
+ format!("{}{}", self.url, path.display())
+ }
+
+ fn is_fresh(&self, path: &Path) -> bool {
+ if !self.requested_update {
+ trace!(
+ "using local {} as user did not request update",
+ path.display()
+ );
+ true
+ } else if self.config.cli_unstable().no_index_update {
+ trace!("using local {} in no_index_update mode", path.display());
+ true
+ } else if self.config.offline() {
+ trace!("using local {} in offline mode", path.display());
+ true
+ } else if self.fresh.contains(path) {
+ trace!("using local {} as it was already fetched", path.display());
+ true
+ } else {
+ debug!("checking freshness of {}", path.display());
+ false
+ }
+ }
+
+ /// Get the cached registry configuration, if it exists.
+ fn config_cached(&mut self) -> CargoResult<Option<&RegistryConfig>> {
+ if self.registry_config.is_some() {
+ return Ok(self.registry_config.as_ref());
+ }
+ let config_json_path = self
+ .assert_index_locked(&self.index_path)
+ .join("config.json");
+ match fs::read(&config_json_path) {
+ Ok(raw_data) => match serde_json::from_slice(&raw_data) {
+ Ok(json) => {
+ self.registry_config = Some(json);
+ }
+ Err(e) => log::debug!("failed to decode cached config.json: {}", e),
+ },
+ Err(e) => {
+ if e.kind() != ErrorKind::NotFound {
+ log::debug!("failed to read config.json cache: {}", e)
+ }
+ }
+ }
+ Ok(self.registry_config.as_ref())
+ }
+
+ /// Get the registry configuration.
+ fn config(&mut self) -> Poll<CargoResult<&RegistryConfig>> {
+ debug!("loading config");
+ let index_path = self.assert_index_locked(&self.index_path);
+ let config_json_path = index_path.join("config.json");
+ if self.is_fresh(Path::new("config.json")) && self.config_cached()?.is_some() {
+ return Poll::Ready(Ok(self.registry_config.as_ref().unwrap()));
+ }
+
+ match ready!(self.load(Path::new(""), Path::new("config.json"), None)?) {
+ LoadResponse::Data {
+ raw_data,
+ index_version: _,
+ } => {
+ trace!("config loaded");
+ self.registry_config = Some(serde_json::from_slice(&raw_data)?);
+ if paths::create_dir_all(&config_json_path.parent().unwrap()).is_ok() {
+ if let Err(e) = fs::write(&config_json_path, &raw_data) {
+ log::debug!("failed to write config.json cache: {}", e);
+ }
+ }
+ Poll::Ready(Ok(self.registry_config.as_ref().unwrap()))
+ }
+ LoadResponse::NotFound => {
+ Poll::Ready(Err(anyhow::anyhow!("config.json not found in registry")))
+ }
+ LoadResponse::CacheValid => Poll::Ready(Err(crate::util::internal(
+ "config.json is never stored in the index cache",
+ ))),
+ }
+ }
+}
+
+impl<'cfg> RegistryData for HttpRegistry<'cfg> {
+ fn prepare(&self) -> CargoResult<()> {
+ Ok(())
+ }
+
+ fn index_path(&self) -> &Filesystem {
+ &self.index_path
+ }
+
+ fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path {
+ self.config.assert_package_cache_locked(path)
+ }
+
+ fn is_updated(&self) -> bool {
+ self.requested_update
+ }
+
+ fn load(
+ &mut self,
+ _root: &Path,
+ path: &Path,
+ index_version: Option<&str>,
+ ) -> Poll<CargoResult<LoadResponse>> {
+ trace!("load: {}", path.display());
+ if let Some(_token) = self.downloads.pending_paths.get(path) {
+ debug!("dependency is still pending: {}", path.display());
+ return Poll::Pending;
+ }
+
+ if let Some(index_version) = index_version {
+ trace!(
+ "local cache of {} is available at version `{}`",
+ path.display(),
+ index_version
+ );
+ if self.is_fresh(path) {
+ return Poll::Ready(Ok(LoadResponse::CacheValid));
+ }
+ } else if self.fresh.contains(path) {
+ // We have no cached copy of this file, and we already downloaded it.
+ debug!(
+ "cache did not contain previously downloaded file {}",
+ path.display()
+ );
+ return Poll::Ready(Ok(LoadResponse::NotFound));
+ }
+
+ if let Some(result) = self.downloads.results.remove(path) {
+ let result =
+ result.with_context(|| format!("download of {} failed", path.display()))?;
+
+ assert!(
+ self.fresh.insert(path.to_path_buf()),
+ "downloaded the index file `{}` twice",
+ path.display()
+ );
+
+ // The statuses handled here need to be kept in sync with the codes
+ // handled in `handle_completed_downloads`.
+ match result.response_code {
+ StatusCode::Success => {
+ let response_index_version = if let Some(etag) = result.header_map.etag {
+ format!("{}: {}", ETAG, etag)
+ } else if let Some(lm) = result.header_map.last_modified {
+ format!("{}: {}", LAST_MODIFIED, lm)
+ } else {
+ UNKNOWN.to_string()
+ };
+ trace!("index file version: {}", response_index_version);
+ return Poll::Ready(Ok(LoadResponse::Data {
+ raw_data: result.data,
+ index_version: Some(response_index_version),
+ }));
+ }
+ StatusCode::NotModified => {
+ // Not Modified: the data in the cache is still the latest.
+ if index_version.is_none() {
+ return Poll::Ready(Err(anyhow::anyhow!(
+ "server said not modified (HTTP 304) when no local cache exists"
+ )));
+ }
+ return Poll::Ready(Ok(LoadResponse::CacheValid));
+ }
+ StatusCode::NotFound => {
+ // The crate was not found or deleted from the registry.
+ return Poll::Ready(Ok(LoadResponse::NotFound));
+ }
+ StatusCode::Unauthorized
+ if !self.auth_required
+ && path == Path::new("config.json")
+ && self.config.cli_unstable().registry_auth =>
+ {
+ debug!("re-attempting request for config.json with authorization included.");
+ self.fresh.remove(path);
+ self.auth_required = true;
+
+ // Look for a `www-authenticate` header with the `Cargo` scheme.
+ for header in &result.header_map.www_authenticate {
+ for challenge in http_auth::ChallengeParser::new(header) {
+ match challenge {
+ Ok(challenge) if challenge.scheme.eq_ignore_ascii_case("Cargo") => {
+ // Look for the `login_url` parameter.
+ for (param, value) in challenge.params {
+ if param.eq_ignore_ascii_case("login_url") {
+ self.login_url = Some(value.to_unescaped().into_url()?);
+ }
+ }
+ }
+ Ok(challenge) => {
+ debug!("ignoring non-Cargo challenge: {}", challenge.scheme)
+ }
+ Err(e) => debug!("failed to parse challenge: {}", e),
+ }
+ }
+ }
+ }
+ StatusCode::Unauthorized => {
+ let err = Err(HttpNotSuccessful {
+ code: 401,
+ body: result.data,
+ url: self.full_url(path),
+ }
+ .into());
+ if self.auth_required {
+ return Poll::Ready(err.context(auth::AuthorizationError {
+ sid: self.source_id.clone(),
+ login_url: self.login_url.clone(),
+ reason: auth::AuthorizationErrorReason::TokenRejected,
+ }));
+ } else {
+ return Poll::Ready(err);
+ }
+ }
+ }
+ }
+
+ if path != Path::new("config.json") {
+ self.auth_required = ready!(self.config()?).auth_required;
+ } else if !self.auth_required {
+ // Check if there's a cached config that says auth is required.
+ // This allows avoiding the initial unauthenticated request to probe.
+ if let Some(config) = self.config_cached()? {
+ self.auth_required = config.auth_required;
+ }
+ }
+
+ if !self.config.cli_unstable().registry_auth {
+ self.auth_required = false;
+ }
+
+ // Looks like we're going to have to do a network request.
+ self.start_fetch()?;
+
+ let mut handle = ops::http_handle(self.config)?;
+ let full_url = self.full_url(path);
+ debug!("fetch {}", full_url);
+ handle.get(true)?;
+ handle.url(&full_url)?;
+ handle.follow_location(true)?;
+
+ // Enable HTTP/2 if possible.
+ if self.multiplexing {
+ crate::try_old_curl!(handle.http_version(HttpVersion::V2), "HTTP2");
+ } else {
+ handle.http_version(HttpVersion::V11)?;
+ }
+
+ // This is an option to `libcurl` which indicates that if there's a
+ // bunch of parallel requests to the same host they all wait until the
+ // pipelining status of the host is known. This means that we won't
+ // initiate dozens of connections to crates.io, but rather only one.
+ // Once the main one is opened we realize that pipelining and
+ // multiplexing are possible with static.crates.io. All in all this
+ // reduces the number of connections down to a more manageable state.
+ crate::try_old_curl!(handle.pipewait(true), "pipewait");
+
+ let mut headers = List::new();
+ // Include a header to identify the protocol. This allows the server to
+ // know that Cargo is attempting to use the sparse protocol.
+ headers.append("cargo-protocol: version=1")?;
+ headers.append("accept: text/plain")?;
+
+ // If we have a cached copy of the file, include IF_NONE_MATCH or IF_MODIFIED_SINCE header.
+ if let Some(index_version) = index_version {
+ if let Some((key, value)) = index_version.split_once(':') {
+ match key {
+ ETAG => headers.append(&format!("{}: {}", IF_NONE_MATCH, value.trim()))?,
+ LAST_MODIFIED => {
+ headers.append(&format!("{}: {}", IF_MODIFIED_SINCE, value.trim()))?
+ }
+ _ => debug!("unexpected index version: {}", index_version),
+ }
+ }
+ }
+ if self.auth_required {
+ let authorization =
+ auth::auth_token(self.config, &self.source_id, self.login_url.as_ref(), None)?;
+ headers.append(&format!("Authorization: {}", authorization))?;
+ trace!("including authorization for {}", full_url);
+ }
+ handle.http_headers(headers)?;
+
+ // We're going to have a bunch of downloads all happening "at the same time".
+ // So, we need some way to track what headers/data/responses are for which request.
+ // We do that through this token. Each request (and associated response) gets one.
+ let token = self.downloads.next;
+ self.downloads.next += 1;
+ debug!("downloading {} as {}", path.display(), token);
+ assert!(
+ self.downloads.pending_paths.insert(path.to_path_buf()),
+ "path queued for download more than once"
+ );
+
+ // Each write should go to self.downloads.pending[&token].data.
+ // Since the write function must be 'static, we access downloads through a thread-local.
+ // That thread-local is set up in `block_until_ready` when it calls self.multi.perform,
+ // which is what ultimately calls this method.
+ handle.write_function(move |buf| {
+ trace!("{} - {} bytes of data", token, buf.len());
+ tls::with(|downloads| {
+ if let Some(downloads) = downloads {
+ downloads.pending[&token]
+ .0
+ .data
+ .borrow_mut()
+ .extend_from_slice(buf);
+ }
+ });
+ Ok(buf.len())
+ })?;
+
+ // And ditto for the header function.
+ handle.header_function(move |buf| {
+ if let Some((tag, value)) = Self::handle_http_header(buf) {
+ tls::with(|downloads| {
+ if let Some(downloads) = downloads {
+ let mut header_map = downloads.pending[&token].0.header_map.borrow_mut();
+ match tag.to_ascii_lowercase().as_str() {
+ LAST_MODIFIED => header_map.last_modified = Some(value.to_string()),
+ ETAG => header_map.etag = Some(value.to_string()),
+ WWW_AUTHENTICATE => header_map.www_authenticate.push(value.to_string()),
+ _ => {}
+ }
+ }
+ });
+ }
+
+ true
+ })?;
+
+ let dl = Download {
+ token,
+ path: path.to_path_buf(),
+ data: RefCell::new(Vec::new()),
+ header_map: Default::default(),
+ retry: Retry::new(self.config)?,
+ };
+
+ // Finally add the request we've lined up to the pool of requests that cURL manages.
+ let mut handle = self.multi.add(handle)?;
+ handle.set_token(token)?;
+ self.downloads.pending.insert(dl.token, (dl, handle));
+
+ Poll::Pending
+ }
+
+ fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> {
+ let mut cfg = ready!(self.config()?).clone();
+ if !self.config.cli_unstable().registry_auth {
+ cfg.auth_required = false;
+ }
+ Poll::Ready(Ok(Some(cfg)))
+ }
+
+ fn invalidate_cache(&mut self) {
+ // Actually updating the index is more or less a no-op for this implementation.
+ // All it does is ensure that a subsequent load will double-check files with the
+ // server rather than rely on a locally cached copy of the index files.
+ debug!("invalidated index cache");
+ self.fresh.clear();
+ self.requested_update = true;
+ }
+
+ fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> {
+ let registry_config = loop {
+ match self.config()? {
+ Poll::Pending => self.block_until_ready()?,
+ Poll::Ready(cfg) => break cfg.to_owned(),
+ }
+ };
+ download::download(
+ &self.cache_path,
+ &self.config,
+ pkg,
+ checksum,
+ registry_config,
+ )
+ }
+
+ fn finish_download(
+ &mut self,
+ pkg: PackageId,
+ checksum: &str,
+ data: &[u8],
+ ) -> CargoResult<File> {
+ download::finish_download(&self.cache_path, &self.config, pkg, checksum, data)
+ }
+
+ fn is_crate_downloaded(&self, pkg: PackageId) -> bool {
+ download::is_crate_downloaded(&self.cache_path, &self.config, pkg)
+ }
+
+ fn block_until_ready(&mut self) -> CargoResult<()> {
+ trace!(
+ "block_until_ready: {} transfers pending",
+ self.downloads.pending.len()
+ );
+ self.downloads.blocking_calls += 1;
+
+ loop {
+ self.handle_completed_downloads()?;
+
+ let remaining_in_multi = tls::set(&self.downloads, || {
+ self.multi
+ .perform()
+ .with_context(|| "failed to perform http requests")
+ })?;
+ trace!("{} transfers remaining", remaining_in_multi);
+
+ if remaining_in_multi == 0 {
+ return Ok(());
+ }
+
+ // We have no more replies to provide the caller with,
+ // so we need to wait until cURL has something new for us.
+ let timeout = self
+ .multi
+ .get_timeout()?
+ .unwrap_or_else(|| Duration::new(1, 0));
+ self.multi
+ .wait(&mut [], timeout)
+ .with_context(|| "failed to wait on curl `Multi`")?;
+ }
+ }
+}
+
+impl<'cfg> Downloads<'cfg> {
+ fn tick(&self) -> CargoResult<()> {
+ let mut progress = self.progress.borrow_mut();
+ let progress = progress.as_mut().unwrap();
+
+ // Since the sparse protocol discovers dependencies as it goes,
+ // it's not possible to get an accurate progress indication.
+ //
+ // As an approximation, we assume that the depth of the dependency graph
+ // is fixed, and base the progress on how many times the caller has asked
+ // for blocking. If there are actually additional dependencies, the progress
+ // bar will get stuck. If there are fewer dependencies, it will disappear
+ // early. It will never go backwards.
+ //
+ // The status text also contains the number of completed & pending
+ // requests, which gives a better indication of forward progress.
+ let approximate_tree_depth = 10;
+
+ progress.tick(
+ self.blocking_calls.min(approximate_tree_depth),
+ approximate_tree_depth + 1,
+ &format!(
+ " {} complete; {} pending",
+ self.downloads_finished,
+ self.pending.len()
+ ),
+ )
+ }
+}
+
+mod tls {
+ use super::Downloads;
+ use std::cell::Cell;
+
+ thread_local!(static PTR: Cell<usize> = Cell::new(0));
+
+ pub(crate) fn with<R>(f: impl FnOnce(Option<&Downloads<'_>>) -> R) -> R {
+ let ptr = PTR.with(|p| p.get());
+ if ptr == 0 {
+ f(None)
+ } else {
+ // Safety: `ptr` is only ever set by `set` below, which guarantees it points at a valid `Downloads`.
+ let ptr = unsafe { &*(ptr as *const Downloads<'_>) };
+ f(Some(ptr))
+ }
+ }
+
+ pub(crate) fn set<R>(dl: &Downloads<'_>, f: impl FnOnce() -> R) -> R {
+ struct Reset<'a, T: Copy>(&'a Cell<T>, T);
+
+ impl<'a, T: Copy> Drop for Reset<'a, T> {
+ fn drop(&mut self) {
+ self.0.set(self.1);
+ }
+ }
+
+ PTR.with(|p| {
+ let _reset = Reset(p, p.get());
+ p.set(dl as *const Downloads<'_> as usize);
+ f()
+ })
+ }
+}
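The `tls` module above exists because curl's write and header callbacks must be `'static`, so they cannot borrow `Downloads` directly; instead a raw pointer is stashed in a thread-local for the duration of `Multi::perform`, and the callbacks reach the live `Downloads` through it. A stripped-down, self-contained sketch of the same scoped-pointer pattern (the `Downloads` struct here is simplified for the example):

    use std::cell::Cell;

    struct Downloads {
        finished: usize,
    }

    thread_local!(static PTR: Cell<usize> = Cell::new(0));

    fn with<R>(f: impl FnOnce(Option<&Downloads>) -> R) -> R {
        let ptr = PTR.with(|p| p.get());
        if ptr == 0 {
            f(None)
        } else {
            // Safety: only `set` stores into PTR, and its drop guard clears
            // the slot before the borrowed reference goes out of scope.
            let dl = unsafe { &*(ptr as *const Downloads) };
            f(Some(dl))
        }
    }

    fn set<R>(dl: &Downloads, f: impl FnOnce() -> R) -> R {
        struct Reset<'a>(&'a Cell<usize>, usize);
        impl Drop for Reset<'_> {
            fn drop(&mut self) {
                self.0.set(self.1);
            }
        }
        PTR.with(|p| {
            let _reset = Reset(p, p.get());
            p.set(dl as *const Downloads as usize);
            f()
        })
    }

    fn main() {
        let dl = Downloads { finished: 2 };
        // A 'static callback that cannot capture `&dl` can still reach it:
        set(&dl, || with(|d| println!("finished: {}", d.unwrap().finished)));
        // Outside the `set` scope the slot is cleared again.
        with(|d| assert!(d.is_none()));
    }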
diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs
new file mode 100644
index 0000000..633ed74
--- /dev/null
+++ b/src/cargo/sources/registry/index.rs
@@ -0,0 +1,889 @@
+//! Management of the index of a registry source
+//!
+//! This module contains management of the index and various operations, such as
+//! actually parsing the index, looking for crates, etc. This is intended to be
+//! abstract over remote indices (downloaded via git) and local registry indices
+//! (which are all just present on the filesystem).
+//!
+//! ## Index Performance
+//!
+//! One important aspect of the index is that we want to optimize the "happy
+//! path" as much as possible. Whenever you type `cargo build` Cargo will
+//! *always* reparse the registry and learn about dependency information. This
+//! is done because Cargo needs to learn about the upstream crates.io crates
+//! that you're using and ensure that the preexisting `Cargo.lock` still matches
+//! the current state of the world.
+//!
+//! Consequently, Cargo "null builds" (the overhead that Cargo itself adds to
+//! each build) need to be fast when accessing the index. The primary performance
+//! optimization here is to avoid parsing JSON blobs from the registry if we
+//! don't need them. Most secondary optimizations are centered around removing
+//! allocations and such, but avoiding parsing JSON is the #1 optimization.
+//!
+//! When we get queries from the resolver we're given a `Dependency`. This
+//! dependency in turn has a version requirement, and with lock files that
+//! already exist these version requirements are exact version requirements
+//! `=a.b.c`. This means that we in theory only need to parse one line of JSON
+//! per query in the registry, the one that matches version `a.b.c`.
+//!
+//! The crates.io index, however, is not amenable to this form of query. Instead
+//! the crates.io index is simply a file where each line is a JSON blob. To
+//! learn about the versions in each JSON blob we would need to parse the JSON,
+//! defeating the purpose of trying to parse as little as possible.
+//!
+//! > Note that as a small aside even *loading* the JSON from the registry is
+//! > actually pretty slow. For crates.io and remote registries we don't
+//! > actually check out the git index on disk because that takes quite some
+//! > time and is quite large. Instead we use `libgit2` to read the JSON from
+//! > the raw git objects. This in turn can be slow (aka show up high in
+//! > profiles) because libgit2 has to do deflate decompression and such.
+//!
+//! To solve all these issues a strategy is employed here where Cargo basically
+//! creates an index into the index. The first time a package is queried about
+//! (first time being for an entire computer) Cargo will load the contents
+//! (slowly via libgit2) from the registry. It will then (slowly) parse every
+//! single line to learn about its versions. Afterwards, however, Cargo will
+//! emit a new file (a cache) which is amenable for speedily parsing in future
+//! invocations.
+//!
+//! This cache file is currently organized by basically having the semver
+//! version extracted from each JSON blob. That way Cargo can quickly and
+//! easily learn all versions contained and which JSON blob they're associated
+//! with. A JSON blob then doesn't actually need to get parsed unless its
+//! version matches a query.
+//!
+//! Altogether the initial measurements of this show a massive improvement for
+//! Cargo null build performance. It's expected that the improvements earned
+//! here will continue to grow over time in the sense that the previous
+//! implementation (parse all lines each time) actually continues to slow down
+//! over time as new versions of a crate are published. In any case when first
+//! implemented a null build of Cargo itself would parse 3700 JSON blobs from
+//! the registry and load 150 blobs from git. Afterwards it parses 150 JSON
+//! blobs and loads 0 files from git. Removing 200ms or more from Cargo's
+//! startup time is certainly nothing to sneeze at!
+//!
+//! Note that this is just a high-level overview, there's of course lots of
+//! details like invalidating caches and whatnot which are handled below, but
+//! hopefully those are more obvious inline in the code itself.
+
+use crate::core::dependency::Dependency;
+use crate::core::{PackageId, SourceId, Summary};
+use crate::sources::registry::{LoadResponse, RegistryData, RegistryPackage, INDEX_V_MAX};
+use crate::util::interning::InternedString;
+use crate::util::{internal, CargoResult, Config, Filesystem, OptVersionReq, ToSemver};
+use anyhow::bail;
+use cargo_util::{paths, registry::make_dep_path};
+use log::{debug, info};
+use semver::Version;
+use std::collections::{HashMap, HashSet};
+use std::fs;
+use std::io::ErrorKind;
+use std::path::Path;
+use std::str;
+use std::task::{ready, Poll};
+
+/// Crates.io treats hyphens and underscores as interchangeable, but the index and old Cargo do not.
+/// Therefore, the index must store the uncanonicalized version of the name so old Cargos can find it.
+/// This iterator tries all possible combinations of switching hyphens and underscores to find the
+/// uncanonicalized one. As all stored inputs have the correct spelling, we start with the spelling
+/// as-provided.
+struct UncanonicalizedIter<'s> {
+ input: &'s str,
+ num_hyphen_underscore: u32,
+ hyphen_combination_num: u16,
+}
+
+impl<'s> UncanonicalizedIter<'s> {
+ fn new(input: &'s str) -> Self {
+ let num_hyphen_underscore = input.chars().filter(|&c| c == '_' || c == '-').count() as u32;
+ UncanonicalizedIter {
+ input,
+ num_hyphen_underscore,
+ hyphen_combination_num: 0,
+ }
+ }
+}
+
+impl<'s> Iterator for UncanonicalizedIter<'s> {
+ type Item = String;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.hyphen_combination_num > 0
+ && self.hyphen_combination_num.trailing_zeros() >= self.num_hyphen_underscore
+ {
+ return None;
+ }
+
+ let ret = Some(
+ self.input
+ .chars()
+ .scan(0u16, |s, c| {
+ // the check against 15 here is to prevent
+ // shift overflow on inputs with more than 15 hyphens
+ if (c == '_' || c == '-') && *s <= 15 {
+ let switch = (self.hyphen_combination_num & (1u16 << *s)) > 0;
+ let out = if (c == '_') ^ switch { '_' } else { '-' };
+ *s += 1;
+ Some(out)
+ } else {
+ Some(c)
+ }
+ })
+ .collect(),
+ );
+ self.hyphen_combination_num += 1;
+ ret
+ }
+}
+
+#[test]
+fn no_hyphen() {
+ assert_eq!(
+ UncanonicalizedIter::new("test").collect::<Vec<_>>(),
+ vec!["test".to_string()]
+ )
+}
+
+#[test]
+fn two_hyphen() {
+ assert_eq!(
+ UncanonicalizedIter::new("te-_st").collect::<Vec<_>>(),
+ vec![
+ "te-_st".to_string(),
+ "te__st".to_string(),
+ "te--st".to_string(),
+ "te_-st".to_string()
+ ]
+ )
+}
+
+#[test]
+fn overflow_hyphen() {
+ assert_eq!(
+ UncanonicalizedIter::new("te-_-_-_-_-_-_-_-_-st")
+ .take(100)
+ .count(),
+ 100
+ )
+}
+
+/// Manager for handling the on-disk index.
+///
+/// Note that local and remote registries store the index differently. Local
+/// is a simple on-disk tree of files of the raw index. Remote registries are
+/// stored as a raw git repository. The different means of access are handled
+/// via the [`RegistryData`] trait abstraction.
+///
+/// This transparently handles caching of the index in a more efficient format.
+pub struct RegistryIndex<'cfg> {
+ source_id: SourceId,
+ /// Root directory of the index for the registry.
+ path: Filesystem,
+ /// Cache of summary data.
+ ///
+ /// This is keyed off the package name. The [`Summaries`] value handles
+ /// loading the summary data. It keeps an optimized on-disk representation
+ /// of the JSON files, which is created in an as-needed fashion. If it
+ /// hasn't been cached already, it uses [`RegistryData::load`] to access
+ /// the JSON files from the index, and then creates the optimized on-disk
+ /// summary cache.
+ summaries_cache: HashMap<InternedString, Summaries>,
+ /// [`Config`] reference for convenience.
+ config: &'cfg Config,
+}
+
+/// An internal cache of summaries for a particular package.
+///
+/// A list of summaries is loaded from disk via one of two methods:
+///
+/// 1. Primarily Cargo will parse the corresponding file for a crate in the
+/// upstream crates.io registry. That's just a JSON blob per line which we
+/// can parse, extract the version, and then store here.
+///
+/// 2. Alternatively, if Cargo has previously run, we'll have a cached index of
+/// dependencies for the upstream index. This is a file that Cargo maintains
+/// lazily on the local filesystem and is much faster to parse since it
+/// doesn't involve parsing all of the JSON.
+///
+/// To the outward-facing interface it doesn't matter too much where the data
+/// was loaded from, but it's important when reading the implementation to
+/// note that we try to parse as little as possible!
+#[derive(Default)]
+struct Summaries {
+ /// A raw vector of uninterpreted bytes. This is what `Unparsed` start/end
+ /// fields are indexes into. If a `Summaries` is loaded from the crates.io
+ /// index then this field will be empty since nothing is `Unparsed`.
+ raw_data: Vec<u8>,
+
+ /// All known versions of a crate, keyed from their `Version` to the
+ /// possibly parsed or unparsed version of the full summary.
+ versions: HashMap<Version, MaybeIndexSummary>,
+}
+
+/// A lazily parsed `IndexSummary`.
+enum MaybeIndexSummary {
+ /// A summary which has not been parsed yet. The `start` and `end` are
+ /// pointers into `Summaries::raw_data` which this is an entry of.
+ Unparsed { start: usize, end: usize },
+
+ /// An actually parsed summary.
+ Parsed(IndexSummary),
+}
+
+/// A parsed representation of a summary from the index.
+///
+/// In addition to a full `Summary` we have information on whether it is `yanked`.
+pub struct IndexSummary {
+ pub summary: Summary,
+ pub yanked: bool,
+ /// Schema version, see [`RegistryPackage`].
+ v: u32,
+}
+
+/// A representation of the cache on disk that Cargo maintains of summaries.
+/// Cargo will initially parse all summaries in the registry and will then
+/// serialize that into this form and place it in a new location on disk,
+/// ensuring that access in the future is much speedier.
+#[derive(Default)]
+struct SummariesCache<'a> {
+ versions: Vec<(Version, &'a [u8])>,
+ index_version: &'a str,
+}
+
+impl<'cfg> RegistryIndex<'cfg> {
+ pub fn new(
+ source_id: SourceId,
+ path: &Filesystem,
+ config: &'cfg Config,
+ ) -> RegistryIndex<'cfg> {
+ RegistryIndex {
+ source_id,
+ path: path.clone(),
+ summaries_cache: HashMap::new(),
+ config,
+ }
+ }
+
+ /// Returns the hash listed for a specified `PackageId`.
+ pub fn hash(&mut self, pkg: PackageId, load: &mut dyn RegistryData) -> Poll<CargoResult<&str>> {
+ let req = OptVersionReq::exact(pkg.version());
+ let summary = self.summaries(pkg.name(), &req, load)?;
+ let summary = ready!(summary).next();
+ Poll::Ready(Ok(summary
+ .ok_or_else(|| internal(format!("no hash listed for {}", pkg)))?
+ .summary
+ .checksum()
+ .ok_or_else(|| internal(format!("no hash listed for {}", pkg)))?))
+ }
+
+ /// Load a list of summaries for the `name` package in this registry which
+ /// match `req`.
+ ///
+ /// This function will semantically parse the on-disk index, match all
+ /// versions, and then return an iterator over all summaries which matched.
+ /// Internally there are quite a few layers of caching to amortize this cost
+ /// though, since this method is called quite a lot on null builds in Cargo.
+ pub fn summaries<'a, 'b>(
+ &'a mut self,
+ name: InternedString,
+ req: &'b OptVersionReq,
+ load: &mut dyn RegistryData,
+ ) -> Poll<CargoResult<impl Iterator<Item = &'a IndexSummary> + 'b>>
+ where
+ 'a: 'b,
+ {
+ let source_id = self.source_id;
+ let config = self.config;
+
+ // First up actually parse what summaries we have available. If Cargo
+ // has run previously this will parse a Cargo-specific cache file rather
+ // than the registry itself. In effect this is intended to be a quite
+ // cheap operation.
+ let summaries = ready!(self.load_summaries(name, load)?);
+
+ // Iterate over our summaries, extract all relevant ones which match our
+ // version requirement, and then parse all corresponding rows in the
+ // registry. As a reminder this `summaries` method is called for each
+ // entry in a lock file on every build, so we want to absolutely
+ // minimize the amount of work being done here and parse as little as
+ // necessary.
+ let raw_data = &summaries.raw_data;
+ Poll::Ready(Ok(summaries
+ .versions
+ .iter_mut()
+ .filter_map(move |(k, v)| if req.matches(k) { Some(v) } else { None })
+ .filter_map(
+ move |maybe| match maybe.parse(config, raw_data, source_id) {
+ Ok(summary) => Some(summary),
+ Err(e) => {
+ info!("failed to parse `{}` registry package: {}", name, e);
+ None
+ }
+ },
+ )
+ .filter(move |is| {
+ if is.v > INDEX_V_MAX {
+ debug!(
+ "unsupported schema version {} ({} {})",
+ is.v,
+ is.summary.name(),
+ is.summary.version()
+ );
+ false
+ } else {
+ true
+ }
+ })))
+ }
+
+ fn load_summaries(
+ &mut self,
+ name: InternedString,
+ load: &mut dyn RegistryData,
+ ) -> Poll<CargoResult<&mut Summaries>> {
+ // If we've previously loaded what versions are present for `name`, just
+ // return that since our cache should still be valid.
+ if self.summaries_cache.contains_key(&name) {
+ return Poll::Ready(Ok(self.summaries_cache.get_mut(&name).unwrap()));
+ }
+
+ // Prepare the `RegistryData` which will lazily initialize internal data
+ // structures.
+ load.prepare()?;
+
+ let root = load.assert_index_locked(&self.path);
+ let cache_root = root.join(".cache");
+
+ // See module comment in `registry/mod.rs` for why this is structured
+ // the way it is.
+ let fs_name = name
+ .chars()
+ .flat_map(|c| c.to_lowercase())
+ .collect::<String>();
+ let raw_path = make_dep_path(&fs_name, false);
+
+ let mut any_pending = false;
+ // Attempt to handle misspellings by searching for a chain of related
+ // names to the original `raw_path` name. Only return summaries
+ // associated with the first hit, however. The resolver will later
+ // reject any candidates that have the wrong name, and along the way
+ // this will produce helpful "did you mean?" suggestions.
+ for (i, path) in UncanonicalizedIter::new(&raw_path).take(1024).enumerate() {
+ let summaries = Summaries::parse(
+ root,
+ &cache_root,
+ path.as_ref(),
+ self.source_id,
+ load,
+ self.config,
+ )?;
+ if summaries.is_pending() {
+ if i == 0 {
+ // If we have not heard back about the name as requested
+ // then don't ask about other spellings yet.
+ // This prevents us spamming all the variations in the
+ // case where we have the correct spelling.
+ return Poll::Pending;
+ }
+ any_pending = true;
+ }
+ if let Poll::Ready(Some(summaries)) = summaries {
+ self.summaries_cache.insert(name, summaries);
+ return Poll::Ready(Ok(self.summaries_cache.get_mut(&name).unwrap()));
+ }
+ }
+
+ if any_pending {
+ return Poll::Pending;
+ }
+
+ // If nothing was found then this crate doesn't exist, so just use an
+ // empty `Summaries` list.
+ self.summaries_cache.insert(name, Summaries::default());
+ Poll::Ready(Ok(self.summaries_cache.get_mut(&name).unwrap()))
+ }
+
+ /// Clears the in-memory summaries cache.
+ pub fn clear_summaries_cache(&mut self) {
+ self.summaries_cache.clear();
+ }
+
+ pub fn query_inner(
+ &mut self,
+ dep: &Dependency,
+ load: &mut dyn RegistryData,
+ yanked_whitelist: &HashSet<PackageId>,
+ f: &mut dyn FnMut(Summary),
+ ) -> Poll<CargoResult<()>> {
+ if self.config.offline() {
+ // This should only return `Poll::Ready(Ok(()))` if there is at least 1 match.
+ //
+ // If there are 0 matches it should fall through and try again with online.
+ // This is necessary for dependencies that are not used (such as
+ // target-cfg or optional), but are not downloaded. Normally the
+ // build should succeed if they are not downloaded and not used,
+ // but they still need to resolve. If they are actually needed
+ // then cargo will fail to download and an error message
+ // indicating that the required dependency is unavailable while
+ // offline will be displayed.
+ if ready!(self.query_inner_with_online(dep, load, yanked_whitelist, f, false)?) > 0 {
+ return Poll::Ready(Ok(()));
+ }
+ }
+ self.query_inner_with_online(dep, load, yanked_whitelist, f, true)
+ .map_ok(|_| ())
+ }
+
+ fn query_inner_with_online(
+ &mut self,
+ dep: &Dependency,
+ load: &mut dyn RegistryData,
+ yanked_whitelist: &HashSet<PackageId>,
+ f: &mut dyn FnMut(Summary),
+ online: bool,
+ ) -> Poll<CargoResult<usize>> {
+ let source_id = self.source_id;
+
+ let summaries = ready!(self.summaries(dep.package_name(), dep.version_req(), load))?;
+
+ let summaries = summaries
+ // First filter summaries for `--offline`. If we're online then
+ // everything is a candidate, otherwise if we're offline we're only
+ // going to consider candidates which are actually present on disk.
+ //
+ // Note: This particular logic can cause problems with
+ // optional dependencies when offline. If at least 1 version
+ // of an optional dependency is downloaded, but that version
+ // does not satisfy the requirements, then resolution will
+ // fail. Unfortunately, whether or not something is optional
+ // is not known here.
+ .filter(|s| (online || load.is_crate_downloaded(s.summary.package_id())))
+ // Next filter out all yanked packages. Some yanked packages may
+ // leak through if they're in a whitelist (aka if they were
+ // previously in `Cargo.lock`
+ .filter(|s| !s.yanked || yanked_whitelist.contains(&s.summary.package_id()))
+ .map(|s| s.summary.clone());
+
+ // Handle `cargo update --precise` here. If specified, our own source
+ // will have a precise version listed of the form
+ // `<pkg>=<p_req>-><f_req>` where `<pkg>` is the name of a crate on
+ // this source, `<p_req>` is the version installed, and `<f_req>` is the
+ // version requested (argument to `--precise`).
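+ // For example, if `foo 1.0.0` is installed and the user runs
+ // `cargo update -p foo --precise 1.0.7`, the precise string is
+ // `foo=1.0.0->1.0.7`.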
+ let name = dep.package_name().as_str();
+ let precise = match source_id.precise() {
+ Some(p) if p.starts_with(name) && p[name.len()..].starts_with('=') => {
+ let mut vers = p[name.len() + 1..].splitn(2, "->");
+ let current_vers = vers.next().unwrap().to_semver().unwrap();
+ let requested_vers = vers.next().unwrap().to_semver().unwrap();
+ Some((current_vers, requested_vers))
+ }
+ _ => None,
+ };
+ let summaries = summaries.filter(|s| match &precise {
+ Some((current, requested)) => {
+ if dep.version_req().matches(current) {
+ // Unfortunately crates.io allows versions to differ only
+ // by build metadata. This shouldn't be allowed, but since
+ // it is, this will honor it if requested. However, if not
+ // specified, then ignore it.
+ let s_vers = s.version();
+ match (s_vers.build.is_empty(), requested.build.is_empty()) {
+ (true, true) => s_vers == requested,
+ (true, false) => false,
+ (false, true) => {
+ // Strip out the metadata.
+ s_vers.major == requested.major
+ && s_vers.minor == requested.minor
+ && s_vers.patch == requested.patch
+ && s_vers.pre == requested.pre
+ }
+ (false, false) => s_vers == requested,
+ }
+ } else {
+ true
+ }
+ }
+ None => true,
+ });
+
+ let mut count = 0;
+ for summary in summaries {
+ f(summary);
+ count += 1;
+ }
+ Poll::Ready(Ok(count))
+ }
+
+ pub fn is_yanked(
+ &mut self,
+ pkg: PackageId,
+ load: &mut dyn RegistryData,
+ ) -> Poll<CargoResult<bool>> {
+ let req = OptVersionReq::exact(pkg.version());
+ let found = self
+ .summaries(pkg.name(), &req, load)
+ .map_ok(|mut p| p.any(|summary| summary.yanked));
+ found
+ }
+}
+
+impl Summaries {
+ /// Parse out a `Summaries` instance from on-disk state.
+ ///
+ /// This will attempt to prefer parsing a previous cache file that already
+ /// exists from a previous invocation of Cargo (aka you're typing `cargo
+ /// build` again after typing it previously). If parsing fails or the cache
+ /// isn't found, then we take a slower path which loads the full descriptor
+ /// for `relative` from the underlying index (aka typically libgit2 with
+ /// crates.io) and then parse everything in there.
+ ///
+ /// * `root` - this is the root argument passed to `load`
+ /// * `cache_root` - this is the root on the filesystem itself of where to
+ /// store cache files.
+ /// * `relative` - this is the file we're loading from cache or the index
+ /// data
+ /// * `source_id` - the registry's SourceId used when parsing JSON blobs to
+ /// create summaries.
+ /// * `load` - the actual index implementation which may be very slow to
+ /// call. We avoid this if we can.
+ pub fn parse(
+ root: &Path,
+ cache_root: &Path,
+ relative: &Path,
+ source_id: SourceId,
+ load: &mut dyn RegistryData,
+ config: &Config,
+ ) -> Poll<CargoResult<Option<Summaries>>> {
+ // First up, attempt to load the cache. This could fail for all manner
+ // of reasons, but consider all of them non-fatal and just log their
+ // occurrence in case anyone is debugging anything.
+ let cache_path = cache_root.join(relative);
+ let mut cached_summaries = None;
+ let mut index_version = None;
+ match fs::read(&cache_path) {
+ Ok(contents) => match Summaries::parse_cache(contents) {
+ Ok((s, v)) => {
+ cached_summaries = Some(s);
+ index_version = Some(v);
+ }
+ Err(e) => {
+ log::debug!("failed to parse {:?} cache: {}", relative, e);
+ }
+ },
+ Err(e) => log::debug!("cache missing for {:?} error: {}", relative, e),
+ }
+
+ let response = ready!(load.load(root, relative, index_version.as_deref())?);
+
+ match response {
+ LoadResponse::CacheValid => {
+ log::debug!("fast path for registry cache of {:?}", relative);
+ return Poll::Ready(Ok(cached_summaries));
+ }
+ LoadResponse::NotFound => {
+ if let Err(e) = fs::remove_file(cache_path) {
+ if e.kind() != ErrorKind::NotFound {
+ log::debug!("failed to remove from cache: {}", e);
+ }
+ }
+ return Poll::Ready(Ok(None));
+ }
+ LoadResponse::Data {
+ raw_data,
+ index_version,
+ } => {
+ // This is the fallback path where we actually talk to the registry backend to load
+ // information. Here we parse every single line in the index (as we need
+ // to find the versions)
+ log::debug!("slow path for {:?}", relative);
+ let mut cache = SummariesCache::default();
+ let mut ret = Summaries::default();
+ ret.raw_data = raw_data;
+ for line in split(&ret.raw_data, b'\n') {
+ // Attempt forwards-compatibility on the index by ignoring
+ // everything that we ourselves don't understand. That should
+ // allow future cargo implementations to change the
+ // interpretation of each line here while older cargo will
+ // simply ignore the new lines.
+ let summary = match IndexSummary::parse(config, line, source_id) {
+ Ok(summary) => summary,
+ Err(e) => {
+ // This should only happen when there is an index
+ // entry from a future version of cargo that this
+ // version doesn't understand. Hopefully, those future
+ // versions of cargo correctly set INDEX_V_MAX and
+ // CURRENT_CACHE_VERSION, otherwise this will skip
+ // entries in the cache preventing those newer
+ // versions from reading them (that is, until the
+ // cache is rebuilt).
+ log::info!("failed to parse {:?} registry package: {}", relative, e);
+ continue;
+ }
+ };
+ let version = summary.summary.package_id().version().clone();
+ cache.versions.push((version.clone(), line));
+ ret.versions.insert(version, summary.into());
+ }
+ if let Some(index_version) = index_version {
+ log::trace!("caching index_version {}", index_version);
+ let cache_bytes = cache.serialize(index_version.as_str());
+ // Once we have our `cache_bytes` which represents the `Summaries` we're
+ // about to return, write that back out to disk so future Cargo
+ // invocations can use it.
+ //
+ // This is opportunistic so we ignore failure here but are sure to log
+ // something in case of error.
+ if paths::create_dir_all(cache_path.parent().unwrap()).is_ok() {
+ let path = Filesystem::new(cache_path.clone());
+ config.assert_package_cache_locked(&path);
+ if let Err(e) = fs::write(cache_path, &cache_bytes) {
+ log::info!("failed to write cache: {}", e);
+ }
+ }
+
+ // If we've got debug assertions enabled read back in the cached values
+ // and assert they match the expected result.
+ #[cfg(debug_assertions)]
+ {
+ let readback = SummariesCache::parse(&cache_bytes)
+ .expect("failed to parse cache we just wrote");
+ assert_eq!(
+ readback.index_version, index_version,
+ "index_version mismatch"
+ );
+ assert_eq!(readback.versions, cache.versions, "versions mismatch");
+ }
+ }
+ Poll::Ready(Ok(Some(ret)))
+ }
+ }
+ }
+
+ /// Parses the contents of a cache file previously written by
+ /// Cargo.
+ pub fn parse_cache(contents: Vec<u8>) -> CargoResult<(Summaries, InternedString)> {
+ let cache = SummariesCache::parse(&contents)?;
+ let index_version = InternedString::new(cache.index_version);
+ let mut ret = Summaries::default();
+ for (version, summary) in cache.versions {
+ let (start, end) = subslice_bounds(&contents, summary);
+ ret.versions
+ .insert(version, MaybeIndexSummary::Unparsed { start, end });
+ }
+ ret.raw_data = contents;
+ return Ok((ret, index_version));
+
+ // Returns the start/end offsets of `inner` within `outer`. Asserts that
+ // `inner` is a subslice of `outer`.
+ fn subslice_bounds(outer: &[u8], inner: &[u8]) -> (usize, usize) {
+ let outer_start = outer.as_ptr() as usize;
+ let outer_end = outer_start + outer.len();
+ let inner_start = inner.as_ptr() as usize;
+ let inner_end = inner_start + inner.len();
+ assert!(inner_start >= outer_start);
+ assert!(inner_end <= outer_end);
+ (inner_start - outer_start, inner_end - outer_start)
+ }
+ }
+}
+
+// Implementation of serializing/deserializing the cache of summaries on disk.
+// Currently the format looks like:
+//
+// +--------------------+----------------------+-------------+---+
+// | cache version byte | index format version | git sha rev | 0 |
+// +--------------------+----------------------+-------------+---+
+//
+// followed by...
+//
+// +----------------+---+------------+---+
+// | semver version | 0 | JSON blob | 0 | ...
+// +----------------+---+------------+---+
+//
+// The idea is that this is a very easy file for Cargo to parse in future
+// invocations. The read from disk should be quite fast and then afterwards all
+// we need to know is what versions correspond to which JSON blob.
+//
+// The leading version byte is intended to ensure that there's some level of
+// future compatibility against changes to this cache format so if different
+// versions of Cargo share the same cache they don't get too confused. The git
+// sha lets us know when the file needs to be regenerated (it needs regeneration
+// whenever the index itself updates).
+//
+// Cache versions:
+// * `1`: The original version.
+// * `2`: Added the "index format version" field so that if the index format
+// changes, different versions of cargo won't get confused reading each
+// other's caches.
+// * `3`: Bumped the version to work around an issue where multiple versions of
+// a package were published that differ only by semver metadata. For
+// example, openssl-src 110.0.0 and 110.0.0+1.1.0f. Previously, the cache
+// would be incorrectly populated with two entries, both 110.0.0. After
+// this, the metadata will be correctly included. This isn't really a format
+// change, just a version bump to clear the incorrect cache entries. Note:
+// the index shouldn't allow these, but unfortunately crates.io doesn't
+// check it.
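+//
+// As an illustration (values invented for the example), a cache holding a
+// single version of one crate would serialize as:
+//
+//   0x03  <4-byte LE INDEX_V_MAX>  b"etag: W/\"abc\""  0x00
+//   b"1.2.3"  0x00  b"{..json..}"  0x00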
+
+const CURRENT_CACHE_VERSION: u8 = 3;
+
+impl<'a> SummariesCache<'a> {
+ fn parse(data: &'a [u8]) -> CargoResult<SummariesCache<'a>> {
+ // NB: keep this method in sync with `serialize` below
+ let (first_byte, rest) = data
+ .split_first()
+ .ok_or_else(|| anyhow::format_err!("malformed cache"))?;
+ if *first_byte != CURRENT_CACHE_VERSION {
+ bail!("looks like a different Cargo's cache, bailing out");
+ }
+ let index_v_bytes = rest
+ .get(..4)
+ .ok_or_else(|| anyhow::anyhow!("cache expected 4 bytes for index version"))?;
+ let index_v = u32::from_le_bytes(index_v_bytes.try_into().unwrap());
+ if index_v != INDEX_V_MAX {
+ bail!(
+ "index format version {} doesn't match the version I know ({})",
+ index_v,
+ INDEX_V_MAX
+ );
+ }
+ let rest = &rest[4..];
+
+ let mut iter = split(rest, 0);
+ let last_index_update = if let Some(update) = iter.next() {
+ str::from_utf8(update)?
+ } else {
+ bail!("malformed file");
+ };
+ let mut ret = SummariesCache::default();
+ ret.index_version = last_index_update;
+ while let Some(version) = iter.next() {
+ let version = str::from_utf8(version)?;
+ let version = Version::parse(version)?;
+ let summary = iter.next().unwrap();
+ ret.versions.push((version, summary));
+ }
+ Ok(ret)
+ }
+
+ fn serialize(&self, index_version: &str) -> Vec<u8> {
+ // NB: keep this method in sync with `parse` above
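+        // Rough capacity estimate: ~10 bytes per entry covers the semver
+        // string and NUL separators; each entry's JSON blob is counted exactly.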
+ let size = self
+ .versions
+ .iter()
+ .map(|(_version, data)| (10 + data.len()))
+ .sum();
+ let mut contents = Vec::with_capacity(size);
+ contents.push(CURRENT_CACHE_VERSION);
+ contents.extend(&u32::to_le_bytes(INDEX_V_MAX));
+ contents.extend_from_slice(index_version.as_bytes());
+ contents.push(0);
+ for (version, data) in self.versions.iter() {
+ contents.extend_from_slice(version.to_string().as_bytes());
+ contents.push(0);
+ contents.extend_from_slice(data);
+ contents.push(0);
+ }
+ contents
+ }
+}
+
+impl MaybeIndexSummary {
+ /// Parses this "maybe a summary" into a `Parsed` for sure variant.
+ ///
+ /// Does nothing if this is already `Parsed`, and otherwise the `raw_data`
+ /// passed in is sliced with the bounds in `Unparsed` and then actually
+ /// parsed.
+ fn parse(
+ &mut self,
+ config: &Config,
+ raw_data: &[u8],
+ source_id: SourceId,
+ ) -> CargoResult<&IndexSummary> {
+ let (start, end) = match self {
+ MaybeIndexSummary::Unparsed { start, end } => (*start, *end),
+ MaybeIndexSummary::Parsed(summary) => return Ok(summary),
+ };
+ let summary = IndexSummary::parse(config, &raw_data[start..end], source_id)?;
+ *self = MaybeIndexSummary::Parsed(summary);
+ match self {
+ MaybeIndexSummary::Unparsed { .. } => unreachable!(),
+ MaybeIndexSummary::Parsed(summary) => Ok(summary),
+ }
+ }
+}
+
+impl From<IndexSummary> for MaybeIndexSummary {
+ fn from(summary: IndexSummary) -> MaybeIndexSummary {
+ MaybeIndexSummary::Parsed(summary)
+ }
+}
+
+impl IndexSummary {
+ /// Parses a line from the registry's index file into an `IndexSummary` for
+ /// a package.
+ ///
+ /// The `line` provided is expected to be valid JSON.
+ fn parse(config: &Config, line: &[u8], source_id: SourceId) -> CargoResult<IndexSummary> {
+ // ****CAUTION**** Please be extremely careful with returning errors
+ // from this function. Entries that error are not included in the
+ // index cache, and can cause cargo to get confused when switching
+ // between different versions that understand the index differently.
+ // Make sure to consider the INDEX_V_MAX and CURRENT_CACHE_VERSION
+ // values carefully when making changes here.
+ let RegistryPackage {
+ name,
+ vers,
+ cksum,
+ deps,
+ mut features,
+ features2,
+ yanked,
+ links,
+ v,
+ } = serde_json::from_slice(line)?;
+ let v = v.unwrap_or(1);
+ log::trace!("json parsed registry {}/{}", name, vers);
+ let pkgid = PackageId::new(name, &vers, source_id)?;
+ let deps = deps
+ .into_iter()
+ .map(|dep| dep.into_dep(source_id))
+ .collect::<CargoResult<Vec<_>>>()?;
+ if let Some(features2) = features2 {
+ for (name, values) in features2 {
+ features.entry(name).or_default().extend(values);
+ }
+ }
+ let mut summary = Summary::new(config, pkgid, deps, &features, links)?;
+ summary.set_checksum(cksum);
+ Ok(IndexSummary {
+ summary,
+ yanked: yanked.unwrap_or(false),
+ v,
+ })
+ }
+}
+
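+/// Splits `haystack` at each occurrence of `needle`, yielding the chunks in
+/// between. For example, splitting `b"a\x001.0.0\x00"` on `0` yields `b"a"`
+/// and then `b"1.0.0"`; a trailing separator does not produce an empty chunk.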
+fn split(haystack: &[u8], needle: u8) -> impl Iterator<Item = &[u8]> {
+ struct Split<'a> {
+ haystack: &'a [u8],
+ needle: u8,
+ }
+
+ impl<'a> Iterator for Split<'a> {
+ type Item = &'a [u8];
+
+ fn next(&mut self) -> Option<&'a [u8]> {
+ if self.haystack.is_empty() {
+ return None;
+ }
+ let (ret, remaining) = match memchr::memchr(self.needle, self.haystack) {
+ Some(pos) => (&self.haystack[..pos], &self.haystack[pos + 1..]),
+ None => (self.haystack, &[][..]),
+ };
+ self.haystack = remaining;
+ Some(ret)
+ }
+ }
+
+ Split { haystack, needle }
+}
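+
+// A minimal round-trip sketch of the cache encoding above; it assumes
+// `SummariesCache` derives `Default` (which `parse` already relies on) and
+// is illustrative rather than exhaustive.
+#[cfg(test)]
+mod cache_roundtrip {
+    use super::*;
+
+    #[test]
+    fn summaries_cache_roundtrip() {
+        let mut cache = SummariesCache::default();
+        let blob: &[u8] = br#"{"name":"a","vers":"1.0.0"}"#;
+        cache.versions.push((Version::parse("1.0.0").unwrap(), blob));
+        // `serialize` takes the index revision to stamp into the header.
+        let bytes = cache.serialize("abcd1234");
+        let parsed = SummariesCache::parse(&bytes).unwrap();
+        assert_eq!(parsed.index_version, "abcd1234");
+        assert_eq!(parsed.versions.len(), 1);
+        assert_eq!(parsed.versions[0].1, blob);
+    }
+}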
diff --git a/src/cargo/sources/registry/local.rs b/src/cargo/sources/registry/local.rs
new file mode 100644
index 0000000..a4b57a9
--- /dev/null
+++ b/src/cargo/sources/registry/local.rs
@@ -0,0 +1,149 @@
+use crate::core::PackageId;
+use crate::sources::registry::{LoadResponse, MaybeLock, RegistryConfig, RegistryData};
+use crate::util::errors::CargoResult;
+use crate::util::{Config, Filesystem};
+use cargo_util::{paths, Sha256};
+use std::fs::File;
+use std::io::SeekFrom;
+use std::io::{self, prelude::*};
+use std::path::Path;
+use std::task::Poll;
+
+/// A local registry is a registry that lives on the filesystem as a set of
+/// `.crate` files with an `index` directory in the same format as a remote
+/// registry.
+pub struct LocalRegistry<'cfg> {
+ index_path: Filesystem,
+ root: Filesystem,
+ src_path: Filesystem,
+ config: &'cfg Config,
+ updated: bool,
+}
+
+impl<'cfg> LocalRegistry<'cfg> {
+ pub fn new(root: &Path, config: &'cfg Config, name: &str) -> LocalRegistry<'cfg> {
+ LocalRegistry {
+ src_path: config.registry_source_path().join(name),
+ index_path: Filesystem::new(root.join("index")),
+ root: Filesystem::new(root.to_path_buf()),
+ config,
+ updated: false,
+ }
+ }
+}
+
+impl<'cfg> RegistryData for LocalRegistry<'cfg> {
+ fn prepare(&self) -> CargoResult<()> {
+ Ok(())
+ }
+
+ fn index_path(&self) -> &Filesystem {
+ &self.index_path
+ }
+
+ fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path {
+ // Note that the `*_unlocked` variant is used here since we're not
+ // modifying the index and it's required to be externally synchronized.
+ path.as_path_unlocked()
+ }
+
+ fn load(
+ &mut self,
+ root: &Path,
+ path: &Path,
+ _index_version: Option<&str>,
+ ) -> Poll<CargoResult<LoadResponse>> {
+ if self.updated {
+ let raw_data = match paths::read_bytes(&root.join(path)) {
+ Err(e)
+ if e.downcast_ref::<io::Error>()
+ .map_or(false, |ioe| ioe.kind() == io::ErrorKind::NotFound) =>
+ {
+ return Poll::Ready(Ok(LoadResponse::NotFound));
+ }
+ r => r,
+ }?;
+ Poll::Ready(Ok(LoadResponse::Data {
+ raw_data,
+ index_version: None,
+ }))
+ } else {
+ Poll::Pending
+ }
+ }
+
+ fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> {
+ // Local registries don't have configuration for remote APIs or anything
+ // like that
+ Poll::Ready(Ok(None))
+ }
+
+ fn block_until_ready(&mut self) -> CargoResult<()> {
+ if self.updated {
+ return Ok(());
+ }
+ // Nothing to update, we just use what's on disk. Verify it actually
+ // exists though. We don't use any locks as we're just checking whether
+ // these directories exist.
+ let root = self.root.clone().into_path_unlocked();
+ if !root.is_dir() {
+ anyhow::bail!("local registry path is not a directory: {}", root.display());
+ }
+ let index_path = self.index_path.clone().into_path_unlocked();
+ if !index_path.is_dir() {
+ anyhow::bail!(
+ "local registry index path is not a directory: {}",
+ index_path.display()
+ );
+ }
+ self.updated = true;
+ Ok(())
+ }
+
+ fn invalidate_cache(&mut self) {
+ // Local registry has no cache - just reads from disk.
+ }
+
+ fn is_updated(&self) -> bool {
+ self.updated
+ }
+
+ fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> {
+ let crate_file = format!("{}-{}.crate", pkg.name(), pkg.version());
+
+ // Note that the usage of `into_path_unlocked` here is because the local
+ // crate files here never change in that we're not the one writing them,
+ // so it's not our responsibility to synchronize access to them.
+ let path = self.root.join(&crate_file).into_path_unlocked();
+ let mut crate_file = paths::open(&path)?;
+
+ // If we've already got an unpacked version of this crate, then skip the
+ // checksum below as it is in theory already verified.
+ let dst = format!("{}-{}", pkg.name(), pkg.version());
+ if self.src_path.join(dst).into_path_unlocked().exists() {
+ return Ok(MaybeLock::Ready(crate_file));
+ }
+
+ self.config.shell().status("Unpacking", pkg)?;
+
+        // We don't actually need to download anything per se, we just need to
+ // verify the checksum matches the .crate file itself.
+ let actual = Sha256::new().update_file(&crate_file)?.finish_hex();
+ if actual != checksum {
+ anyhow::bail!("failed to verify the checksum of `{}`", pkg)
+ }
+
+ crate_file.seek(SeekFrom::Start(0))?;
+
+ Ok(MaybeLock::Ready(crate_file))
+ }
+
+ fn finish_download(
+ &mut self,
+ _pkg: PackageId,
+ _checksum: &str,
+ _data: &[u8],
+ ) -> CargoResult<File> {
+ panic!("this source doesn't download")
+ }
+}
diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs
new file mode 100644
index 0000000..930a42f
--- /dev/null
+++ b/src/cargo/sources/registry/mod.rs
@@ -0,0 +1,970 @@
+//! A `Source` for registry-based packages.
+//!
+//! # What's a Registry?
+//!
+//! Registries are central locations where packages can be uploaded to,
+//! discovered, and searched for. The purpose of a registry is to have a
+//! location that serves as permanent storage for versions of a crate over time.
+//!
+//! Compared to git sources, a registry provides many packages as well as many
+//! versions simultaneously. Git sources can also have commits deleted through
+//! rebases, whereas registries cannot have their versions deleted.
+//!
+//! # The Index of a Registry
+//!
+//! One of the major difficulties with a registry is that hosting so many
+//! packages may quickly run into performance problems when dealing with
+//! dependency graphs. It's infeasible for cargo to download the entire contents
+//! of the registry just to resolve one package's dependencies, for example. As
+//! a result, cargo needs some efficient method of querying what packages are
+//! available on a registry, what versions are available, and what the
+//! dependencies for each version are.
+//!
+//! One method of doing so would be having the registry expose an HTTP endpoint
+//! which can be queried with a list of packages and which returns their
+//! dependencies and versions. This is somewhat inefficient, however, as we may
+//! have to hit the endpoint many times even when we have already queried for
+//! much of the data locally (for other packages, for example). This also
+//! involves inventing a transport format between the registry and Cargo
+//! itself, so this route was not taken.
+//!
+//! Instead, Cargo communicates with registries through a git repository
+//! referred to as the Index. The Index of a registry is essentially an easily
+//! query-able version of the registry's database for a list of versions of a
+//! package as well as a list of dependencies for each version.
+//!
+//! Using git to host this index provides a number of benefits:
+//!
+//! * The entire index can be stored efficiently locally on disk. This means
+//! that all queries of a registry can happen locally and don't need to touch
+//! the network.
+//!
+//! * Updates of the index are quite efficient. Using git buys incremental
+//! updates, compressed transmission, etc for free. The index must be updated
+//! each time we need fresh information from a registry, but this is one
+//! update of a git repository that probably hasn't changed a whole lot so
+//! it shouldn't be too expensive.
+//!
+//! Additionally, each modification to the index is just appending a line at
+//! the end of a file (the exact format is described later). This means that
+//! the commits for an index are quite small and easily applied/compressible.
+//!
+//! ## The format of the Index
+//!
+//! The index is a store for the list of versions for all packages known, so its
+//! format on disk is optimized slightly to ensure that `ls registry` doesn't
+//! produce a list of all packages ever known. The index also needs to avoid
+//! containing millions of files, which could run into filesystem limits at
+//! some point. To this end, a few decisions were made about the format of the
+//! registry:
+//!
+//! 1. Each crate will have one file corresponding to it. Each version for a
+//! crate will just be a line in this file.
+//! 2. There will be two tiers of directories for crate names, under which
+//! crates corresponding to those tiers will be located.
+//!
+//! As an example, this is an example hierarchy of an index:
+//!
+//! ```notrust
+//! .
+//! ├── 3
+//! │   └── u
+//! │   └── url
+//! ├── bz
+//! │   └── ip
+//! │   └── bzip2
+//! ├── config.json
+//! ├── en
+//! │   └── co
+//! │   └── encoding
+//! └── li
+//!    ├── bg
+//!    │   └── libgit2
+//!    └── nk
+//!    └── link-config
+//! ```
+//!
+//! The root of the index contains a `config.json` file with a few entries
+//! corresponding to the registry (see [`RegistryConfig`] below).
+//!
+//! Otherwise, there are three numbered directories (1, 2, 3) for crates with
+//! names 1, 2, and 3 characters in length. The 1/2 directories simply have the
+//! crate files underneath them, while the 3 directory is sharded by the first
+//! letter of the crate name.
+//!
+//! For all other crates, the top-level directory contains many two-letter directory names,
+//! each of which has many sub-folders with two letters. At the end of all these
+//! are the actual crate files themselves.
+//!
+//! The purpose of this layout is to hopefully cut down on `ls` sizes as well
+//! as to enable efficient lookup based on the crate name itself.
+//!
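+//! For example, the crate `cargo` lives at `ca/rg/cargo`, `url` lives at
+//! `3/u/url`, and a two-letter crate like `ab` lives at `2/ab` (this mapping
+//! is implemented by `make_dep_prefix` at the bottom of this file).
+//!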
+//! ## Crate files
+//!
+//! Each file in the index is the history of one crate over time. Each line in
+//! the file corresponds to one version of a crate, stored in JSON format (see
+//! the `RegistryPackage` structure below).
+//!
+//! As new versions are published, new lines are appended to this file. The only
+//! modifications to this file that should happen over time are yanks of a
+//! particular version.
+//!
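+//! For example, one (abbreviated) line for a hypothetical crate `foo` might
+//! look like:
+//!
+//! ```notrust
+//! {"name":"foo","vers":"0.1.0","deps":[],"cksum":"d867...","features":{},"yanked":false}
+//! ```
+//!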
+//! # Downloading Packages
+//!
+//! The purpose of the Index was to provide an efficient method to resolve the
+//! dependency graph for a package. So far we only required one network
+//! interaction to update the registry's repository (yay!). After resolution has
+//! been performed, however, we need to download the contents of packages so we
+//! can read the full manifest and build the source code.
+//!
+//! To accomplish this, this source's `download` method will make an HTTP
+//! request per-package requested to download tarballs into a local cache. These
+//! tarballs will then be unpacked into a destination folder.
+//!
+//! Note that because versions uploaded to the registry are frozen forever,
+//! the HTTP download and unpacking can all be skipped if the version has
+//! already been downloaded and unpacked. This caching allows us to only
+//! download a package when absolutely necessary.
+//!
+//! # Filesystem Hierarchy
+//!
+//! Overall, the `$HOME/.cargo` looks like this when talking about the registry:
+//!
+//! ```notrust
+//! # A folder under which all registry metadata is hosted (similar to
+//! # $HOME/.cargo/git)
+//! $HOME/.cargo/registry/
+//!
+//! # For each registry that cargo knows about (keyed by hostname + hash)
+//! # there is a folder which is the checked out version of the index for
+//! # the registry in this location. Note that this is done so cargo can
+//! # support multiple registries simultaneously
+//! index/
+//! registry1-<hash>/
+//! registry2-<hash>/
+//! ...
+//!
+//! # This folder is a cache for all downloaded tarballs from a registry.
+//! # Once downloaded and verified, a tarball never changes.
+//! cache/
+//! registry1-<hash>/<pkg>-<version>.crate
+//! ...
+//!
+//! # Location in which all tarballs are unpacked. Each tarball is known to
+//! # be frozen after downloading, so transitively this folder is also
+//! #     frozen once it's unpacked (it's never unpacked again)
+//! src/
+//! registry1-<hash>/<pkg>-<version>/...
+//! ...
+//! ```
+
+use std::borrow::Cow;
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fs::{File, OpenOptions};
+use std::io::{self, Write};
+use std::path::{Path, PathBuf};
+use std::task::Poll;
+
+use anyhow::Context as _;
+use cargo_util::paths::{self, exclude_from_backups_and_indexing};
+use flate2::read::GzDecoder;
+use log::debug;
+use semver::Version;
+use serde::Deserialize;
+use tar::Archive;
+
+use crate::core::dependency::{DepKind, Dependency};
+use crate::core::source::MaybePackage;
+use crate::core::{Package, PackageId, QueryKind, Source, SourceId, Summary};
+use crate::sources::PathSource;
+use crate::util::hex;
+use crate::util::interning::InternedString;
+use crate::util::into_url::IntoUrl;
+use crate::util::network::PollExt;
+use crate::util::{
+ restricted_names, CargoResult, Config, Filesystem, LimitErrorReader, OptVersionReq,
+};
+
+const PACKAGE_SOURCE_LOCK: &str = ".cargo-ok";
+pub const CRATES_IO_INDEX: &str = "https://github.com/rust-lang/crates.io-index";
+pub const CRATES_IO_HTTP_INDEX: &str = "sparse+https://index.crates.io/";
+pub const CRATES_IO_REGISTRY: &str = "crates-io";
+pub const CRATES_IO_DOMAIN: &str = "crates.io";
+const CRATE_TEMPLATE: &str = "{crate}";
+const VERSION_TEMPLATE: &str = "{version}";
+const PREFIX_TEMPLATE: &str = "{prefix}";
+const LOWER_PREFIX_TEMPLATE: &str = "{lowerprefix}";
+const CHECKSUM_TEMPLATE: &str = "{sha256-checksum}";
+const MAX_UNPACK_SIZE: u64 = 512 * 1024 * 1024;
+const MAX_COMPRESSION_RATIO: usize = 20; // 20:1
+
+/// A "source" for a local (see `local::LocalRegistry`) or remote (see
+/// `remote::RemoteRegistry`) registry.
+///
+/// This contains common functionality that is shared between the two registry
+/// kinds, with the registry-specific logic implemented as part of the
+/// [`RegistryData`] trait referenced via the `ops` field.
+pub struct RegistrySource<'cfg> {
+ source_id: SourceId,
+ /// The path where crate files are extracted (`$CARGO_HOME/registry/src/$REG-HASH`).
+ src_path: Filesystem,
+ /// Local reference to [`Config`] for convenience.
+ config: &'cfg Config,
+ /// Abstraction for interfacing to the different registry kinds.
+ ops: Box<dyn RegistryData + 'cfg>,
+ /// Interface for managing the on-disk index.
+ index: index::RegistryIndex<'cfg>,
+ /// A set of packages that should be allowed to be used, even if they are
+ /// yanked.
+ ///
+ /// This is populated from the entries in `Cargo.lock` to ensure that
+ /// `cargo update -p somepkg` won't unlock yanked entries in `Cargo.lock`.
+ /// Otherwise, the resolver would think that those entries no longer
+ /// exist, and it would trigger updates to unrelated packages.
+ yanked_whitelist: HashSet<PackageId>,
+}
+
+/// The `config.json` file stored in the index.
+#[derive(Deserialize, Debug, Clone)]
+#[serde(rename_all = "kebab-case")]
+pub struct RegistryConfig {
+ /// Download endpoint for all crates.
+ ///
+ /// The string is a template which will generate the download URL for the
+ /// tarball of a specific version of a crate. The substrings `{crate}` and
+ /// `{version}` will be replaced with the crate's name and version
+ /// respectively. The substring `{prefix}` will be replaced with the
+ /// crate's prefix directory name, and the substring `{lowerprefix}` will
+ /// be replaced with the crate's prefix directory name converted to
+ /// lowercase. The substring `{sha256-checksum}` will be replaced with the
+ /// crate's sha256 checksum.
+ ///
+    /// For backwards compatibility, if the string does not contain any
+    /// markers (`{crate}`, `{version}`, `{prefix}`, `{lowerprefix}`, or
+    /// `{sha256-checksum}`), it will be extended with
+    /// `/{crate}/{version}/download` to
+ /// support registries like crates.io which were created before the
+ /// templating setup was created.
+ pub dl: String,
+
+ /// API endpoint for the registry. This is what's actually hit to perform
+ /// operations like yanks, owner modifications, publish new crates, etc.
+ /// If this is None, the registry does not support API commands.
+ pub api: Option<String>,
+
+ /// Whether all operations require authentication.
+ #[serde(default)]
+ pub auth_required: bool,
+}
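+
+// For reference, crates.io's `config.json` is essentially:
+//
+//     {
+//         "dl": "https://crates.io/api/v1/crates",
+//         "api": "https://crates.io"
+//     }
+//
+// while a registry opting into the template markers might instead serve, for
+// example, a hypothetical
+// `"dl": "https://example.com/{prefix}/{crate}/{crate}-{version}.crate"`.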
+
+/// The maximum version of the `v` field in the index this version of cargo
+/// understands.
+pub(crate) const INDEX_V_MAX: u32 = 2;
+
+/// A single line in the index representing a single version of a package.
+#[derive(Deserialize)]
+pub struct RegistryPackage<'a> {
+ name: InternedString,
+ vers: Version,
+ #[serde(borrow)]
+ deps: Vec<RegistryDependency<'a>>,
+ features: BTreeMap<InternedString, Vec<InternedString>>,
+ /// This field contains features with new, extended syntax. Specifically,
+ /// namespaced features (`dep:`) and weak dependencies (`pkg?/feat`).
+ ///
+ /// This is separated from `features` because versions older than 1.19
+ /// will fail to load due to not being able to parse the new syntax, even
+ /// with a `Cargo.lock` file.
+ features2: Option<BTreeMap<InternedString, Vec<InternedString>>>,
+ cksum: String,
+ /// If `true`, Cargo will skip this version when resolving.
+ ///
+ /// This was added in 2014. Everything in the crates.io index has this set
+ /// now, so this probably doesn't need to be an option anymore.
+ yanked: Option<bool>,
+ /// Native library name this package links to.
+ ///
+ /// Added early 2018 (see <https://github.com/rust-lang/cargo/pull/4978>),
+ /// can be `None` if published before then.
+ links: Option<InternedString>,
+ /// The schema version for this entry.
+ ///
+ /// If this is None, it defaults to version 1. Entries with unknown
+ /// versions are ignored.
+ ///
+ /// Version `2` format adds the `features2` field.
+ ///
+ /// This provides a method to safely introduce changes to index entries
+ /// and allow older versions of cargo to ignore newer entries it doesn't
+ /// understand. This is honored as of 1.51, so unfortunately older
+ /// versions will ignore it, and potentially misinterpret version 2 and
+ /// newer entries.
+ ///
+ /// The intent is that versions older than 1.51 will work with a
+ /// pre-existing `Cargo.lock`, but they may not correctly process `cargo
+ /// update` or build a lock from scratch. In that case, cargo may
+ /// incorrectly select a new package that uses a new index format. A
+ /// workaround is to downgrade any packages that are incompatible with the
+ /// `--precise` flag of `cargo update`.
+ v: Option<u32>,
+}
+
+#[test]
+fn escaped_char_in_json() {
+ let _: RegistryPackage<'_> = serde_json::from_str(
+ r#"{"name":"a","vers":"0.0.1","deps":[],"cksum":"bae3","features":{}}"#,
+ )
+ .unwrap();
+ let _: RegistryPackage<'_> = serde_json::from_str(
+ r#"{"name":"a","vers":"0.0.1","deps":[],"cksum":"bae3","features":{"test":["k","q"]},"links":"a-sys"}"#
+ ).unwrap();
+
+    // Now we add escaped chars in all the places they can go.
+    // These are not valid, but they should error later than JSON parsing.
+ let _: RegistryPackage<'_> = serde_json::from_str(
+ r#"{
+ "name":"This name has a escaped cher in it \n\t\" ",
+ "vers":"0.0.1",
+ "deps":[{
+ "name": " \n\t\" ",
+ "req": " \n\t\" ",
+ "features": [" \n\t\" "],
+ "optional": true,
+ "default_features": true,
+ "target": " \n\t\" ",
+ "kind": " \n\t\" ",
+ "registry": " \n\t\" "
+ }],
+ "cksum":"bae3",
+ "features":{"test \n\t\" ":["k \n\t\" ","q \n\t\" "]},
+ "links":" \n\t\" "}"#,
+ )
+ .unwrap();
+}
+
+/// A dependency as encoded in the index JSON.
+#[derive(Deserialize)]
+struct RegistryDependency<'a> {
+ name: InternedString,
+ #[serde(borrow)]
+ req: Cow<'a, str>,
+ features: Vec<InternedString>,
+ optional: bool,
+ default_features: bool,
+ target: Option<Cow<'a, str>>,
+ kind: Option<Cow<'a, str>>,
+ registry: Option<Cow<'a, str>>,
+ package: Option<InternedString>,
+ public: Option<bool>,
+}
+
+impl<'a> RegistryDependency<'a> {
+ /// Converts an encoded dependency in the registry to a cargo dependency
+ pub fn into_dep(self, default: SourceId) -> CargoResult<Dependency> {
+ let RegistryDependency {
+ name,
+ req,
+ mut features,
+ optional,
+ default_features,
+ target,
+ kind,
+ registry,
+ package,
+ public,
+ } = self;
+
+ let id = if let Some(registry) = &registry {
+ SourceId::for_registry(&registry.into_url()?)?
+ } else {
+ default
+ };
+
+ let mut dep = Dependency::parse(package.unwrap_or(name), Some(&req), id)?;
+ if package.is_some() {
+ dep.set_explicit_name_in_toml(name);
+ }
+ let kind = match kind.as_deref().unwrap_or("") {
+ "dev" => DepKind::Development,
+ "build" => DepKind::Build,
+ _ => DepKind::Normal,
+ };
+
+ let platform = match target {
+ Some(target) => Some(target.parse()?),
+ None => None,
+ };
+
+ // All dependencies are private by default
+ let public = public.unwrap_or(false);
+
+ // Unfortunately older versions of cargo and/or the registry ended up
+ // publishing lots of entries where the features array contained the
+ // empty feature, "", inside. This confuses the resolution process much
+ // later on and these features aren't actually valid, so filter them all
+ // out here.
+ features.retain(|s| !s.is_empty());
+
+ // In index, "registry" is null if it is from the same index.
+ // In Cargo.toml, "registry" is None if it is from the default
+ if !id.is_crates_io() {
+ dep.set_registry_id(id);
+ }
+
+ dep.set_optional(optional)
+ .set_default_features(default_features)
+ .set_features(features)
+ .set_platform(platform)
+ .set_kind(kind)
+ .set_public(public);
+
+ Ok(dep)
+ }
+}
+
+/// Result from loading data from a registry.
+pub enum LoadResponse {
+ /// The cache is valid. The cached data should be used.
+ CacheValid,
+
+ /// The cache is out of date. Returned data should be used.
+ Data {
+ raw_data: Vec<u8>,
+ index_version: Option<String>,
+ },
+
+    /// The requested crate was not found.
+ NotFound,
+}
+
+/// An abstract interface to handle both a local (see `local::LocalRegistry`)
+/// and remote (see `remote::RemoteRegistry`) registry.
+///
+/// This allows [`RegistrySource`] to abstractly handle both registry kinds.
+pub trait RegistryData {
+ /// Performs initialization for the registry.
+ ///
+ /// This should be safe to call multiple times, the implementation is
+ /// expected to not do any work if it is already prepared.
+ fn prepare(&self) -> CargoResult<()>;
+
+ /// Returns the path to the index.
+ ///
+ /// Note that different registries store the index in different formats
+ /// (remote=git, local=files).
+ fn index_path(&self) -> &Filesystem;
+
+ /// Loads the JSON for a specific named package from the index.
+ ///
+ /// * `root` is the root path to the index.
+ /// * `path` is the relative path to the package to load (like `ca/rg/cargo`).
+ /// * `index_version` is the version of the requested crate data currently in cache.
+ fn load(
+ &mut self,
+ root: &Path,
+ path: &Path,
+ index_version: Option<&str>,
+ ) -> Poll<CargoResult<LoadResponse>>;
+
+ /// Loads the `config.json` file and returns it.
+ ///
+ /// Local registries don't have a config, and return `None`.
+ fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>>;
+
+ /// Invalidates locally cached data.
+ fn invalidate_cache(&mut self);
+
+ /// Is the local cached data up-to-date?
+ fn is_updated(&self) -> bool;
+
+ /// Prepare to start downloading a `.crate` file.
+ ///
+ /// Despite the name, this doesn't actually download anything. If the
+ /// `.crate` is already downloaded, then it returns [`MaybeLock::Ready`].
+ /// If it hasn't been downloaded, then it returns [`MaybeLock::Download`]
+ /// which contains the URL to download. The [`crate::core::package::Downloads`]
+ /// system handles the actual download process. After downloading, it
+ /// calls [`Self::finish_download`] to save the downloaded file.
+ ///
+ /// `checksum` is currently only used by local registries to verify the
+ /// file contents (because local registries never actually download
+ /// anything). Remote registries will validate the checksum in
+ /// `finish_download`. For already downloaded `.crate` files, it does not
+ /// validate the checksum, assuming the filesystem does not suffer from
+ /// corruption or manipulation.
+ fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock>;
+
+ /// Finish a download by saving a `.crate` file to disk.
+ ///
+ /// After [`crate::core::package::Downloads`] has finished a download,
+ /// it will call this to save the `.crate` file. This is only relevant
+ /// for remote registries. This should validate the checksum and save
+ /// the given data to the on-disk cache.
+ ///
+ /// Returns a [`File`] handle to the `.crate` file, positioned at the start.
+ fn finish_download(&mut self, pkg: PackageId, checksum: &str, data: &[u8])
+ -> CargoResult<File>;
+
+ /// Returns whether or not the `.crate` file is already downloaded.
+ fn is_crate_downloaded(&self, _pkg: PackageId) -> bool {
+ true
+ }
+
+ /// Validates that the global package cache lock is held.
+ ///
+ /// Given the [`Filesystem`], this will make sure that the package cache
+ /// lock is held. If not, it will panic. See
+ /// [`Config::acquire_package_cache_lock`] for acquiring the global lock.
+ ///
+ /// Returns the [`Path`] to the [`Filesystem`].
+ fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path;
+
+ /// Block until all outstanding Poll::Pending requests are Poll::Ready.
+ fn block_until_ready(&mut self) -> CargoResult<()>;
+}
+
+/// The status of [`RegistryData::download`] which indicates if a `.crate`
+/// file has already been downloaded, or if not then the URL to download.
+pub enum MaybeLock {
+ /// The `.crate` file is already downloaded. [`File`] is a handle to the
+ /// opened `.crate` file on the filesystem.
+ Ready(File),
+ /// The `.crate` file is not downloaded, here's the URL to download it from.
+ ///
+ /// `descriptor` is just a text string to display to the user of what is
+ /// being downloaded.
+ Download {
+ url: String,
+ descriptor: String,
+ authorization: Option<String>,
+ },
+}
+
+mod download;
+mod http_remote;
+mod index;
+mod local;
+mod remote;
+
+fn short_name(id: SourceId) -> String {
+ let hash = hex::short_hash(&id);
+ let ident = id.url().host_str().unwrap_or("").to_string();
+ format!("{}-{}", ident, hash)
+}
+
+impl<'cfg> RegistrySource<'cfg> {
+ pub fn remote(
+ source_id: SourceId,
+ yanked_whitelist: &HashSet<PackageId>,
+ config: &'cfg Config,
+ ) -> CargoResult<RegistrySource<'cfg>> {
+ assert!(source_id.is_remote_registry());
+ let name = short_name(source_id);
+ let ops = if source_id.is_sparse() {
+ Box::new(http_remote::HttpRegistry::new(source_id, config, &name)?) as Box<_>
+ } else {
+ Box::new(remote::RemoteRegistry::new(source_id, config, &name)) as Box<_>
+ };
+
+ Ok(RegistrySource::new(
+ source_id,
+ config,
+ &name,
+ ops,
+ yanked_whitelist,
+ ))
+ }
+
+ pub fn local(
+ source_id: SourceId,
+ path: &Path,
+ yanked_whitelist: &HashSet<PackageId>,
+ config: &'cfg Config,
+ ) -> RegistrySource<'cfg> {
+ let name = short_name(source_id);
+ let ops = local::LocalRegistry::new(path, config, &name);
+ RegistrySource::new(source_id, config, &name, Box::new(ops), yanked_whitelist)
+ }
+
+ fn new(
+ source_id: SourceId,
+ config: &'cfg Config,
+ name: &str,
+ ops: Box<dyn RegistryData + 'cfg>,
+ yanked_whitelist: &HashSet<PackageId>,
+ ) -> RegistrySource<'cfg> {
+ RegistrySource {
+ src_path: config.registry_source_path().join(name),
+ config,
+ source_id,
+ index: index::RegistryIndex::new(source_id, ops.index_path(), config),
+ yanked_whitelist: yanked_whitelist.clone(),
+ ops,
+ }
+ }
+
+ /// Decode the configuration stored within the registry.
+ ///
+ /// This requires that the index has been at least checked out.
+ pub fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> {
+ self.ops.config()
+ }
+
+ /// Unpacks a downloaded package into a location where it's ready to be
+ /// compiled.
+ ///
+ /// No action is taken if the source looks like it's already unpacked.
+ fn unpack_package(&self, pkg: PackageId, tarball: &File) -> CargoResult<PathBuf> {
+ // The `.cargo-ok` file is used to track if the source is already
+ // unpacked.
+ let package_dir = format!("{}-{}", pkg.name(), pkg.version());
+ let dst = self.src_path.join(&package_dir);
+ let path = dst.join(PACKAGE_SOURCE_LOCK);
+ let path = self.config.assert_package_cache_locked(&path);
+ let unpack_dir = path.parent().unwrap();
+ match path.metadata() {
+ Ok(meta) if meta.len() > 0 => return Ok(unpack_dir.to_path_buf()),
+ Ok(_meta) => {
+ // The `.cargo-ok` file is not in a state we expect it to be
+ // (with two bytes containing "ok").
+ //
+ // Cargo has always included a `.cargo-ok` file to detect if
+ // extraction was interrupted, but it was originally empty.
+ //
+ // In 1.34, Cargo was changed to create the `.cargo-ok` file
+ // before it started extraction to implement fine-grained
+ // locking. After it was finished extracting, it wrote two
+ // bytes to indicate it was complete. It would use the length
+ // check to detect if it was possibly interrupted.
+ //
+ // In 1.36, Cargo changed to not use fine-grained locking, and
+ // instead used a global lock. The use of `.cargo-ok` was no
+ // longer needed for locking purposes, but was kept to detect
+ // when extraction was interrupted.
+ //
+ // In 1.49, Cargo changed to not create the `.cargo-ok` file
+ // before it started extraction to deal with `.crate` files
+ // that inexplicably had a `.cargo-ok` file in them.
+ //
+ // In 1.64, Cargo changed to detect `.crate` files with
+ // `.cargo-ok` files in them in response to CVE-2022-36113,
+ // which dealt with malicious `.crate` files making
+ // `.cargo-ok` a symlink causing cargo to write "ok" to any
+ // arbitrary file on the filesystem it has permission to.
+ //
+ // This is all a long-winded way of explaining the
+ // circumstances that might cause a directory to contain a
+ // `.cargo-ok` file that is empty or otherwise corrupted.
+            // Either this was extracted by a version of Cargo before 1.34,
+            // in which case everything should be fine, or an empty
+            // file was created by versions 1.36 to 1.49, indicating that the
+            // extraction was interrupted and that we need to start again.
+ //
+ // Another possibility is that the filesystem is simply
+ // corrupted, in which case deleting the directory might be
+ // the safe thing to do. That is probably unlikely, though.
+ //
+ // To be safe, this deletes the directory and starts over
+ // again.
+ log::warn!("unexpected length of {path:?}, clearing cache");
+ paths::remove_dir_all(dst.as_path_unlocked())?;
+ }
+ Err(e) if e.kind() == io::ErrorKind::NotFound => {}
+ Err(e) => anyhow::bail!("failed to access package completion {path:?}: {e}"),
+ }
+ dst.create_dir()?;
+ let mut tar = {
+ let size_limit = max_unpack_size(self.config, tarball.metadata()?.len());
+ let gz = GzDecoder::new(tarball);
+ let gz = LimitErrorReader::new(gz, size_limit);
+ Archive::new(gz)
+ };
+ let prefix = unpack_dir.file_name().unwrap();
+ let parent = unpack_dir.parent().unwrap();
+ for entry in tar.entries()? {
+ let mut entry = entry.with_context(|| "failed to iterate over archive")?;
+ let entry_path = entry
+ .path()
+ .with_context(|| "failed to read entry path")?
+ .into_owned();
+
+ // We're going to unpack this tarball into the global source
+ // directory, but we want to make sure that it doesn't accidentally
+ // (or maliciously) overwrite source code from other crates. Cargo
+ // itself should never generate a tarball that hits this error, and
+ // crates.io should also block uploads with these sorts of tarballs,
+ // but be extra sure by adding a check here as well.
+ if !entry_path.starts_with(prefix) {
+ anyhow::bail!(
+ "invalid tarball downloaded, contains \
+ a file at {:?} which isn't under {:?}",
+ entry_path,
+ prefix
+ )
+ }
+ // Prevent unpacking the lockfile from the crate itself.
+ if entry_path
+ .file_name()
+ .map_or(false, |p| p == PACKAGE_SOURCE_LOCK)
+ {
+ continue;
+ }
+            // Unpack the entry; failures get extra context attached below.
+ let mut result = entry.unpack_in(parent).map_err(anyhow::Error::from);
+ if cfg!(windows) && restricted_names::is_windows_reserved_path(&entry_path) {
+ result = result.with_context(|| {
+ format!(
+ "`{}` appears to contain a reserved Windows path, \
+ it cannot be extracted on Windows",
+ entry_path.display()
+ )
+ });
+ }
+ result
+ .with_context(|| format!("failed to unpack entry at `{}`", entry_path.display()))?;
+ }
+
+ // Now that we've finished unpacking, create and write to the lock file to indicate that
+ // unpacking was successful.
+ let mut ok = OpenOptions::new()
+ .create_new(true)
+ .read(true)
+ .write(true)
+ .open(&path)
+ .with_context(|| format!("failed to open `{}`", path.display()))?;
+ write!(ok, "ok")?;
+
+ Ok(unpack_dir.to_path_buf())
+ }
+
+ fn get_pkg(&mut self, package: PackageId, path: &File) -> CargoResult<Package> {
+ let path = self
+ .unpack_package(package, path)
+ .with_context(|| format!("failed to unpack package `{}`", package))?;
+ let mut src = PathSource::new(&path, self.source_id, self.config);
+ src.update()?;
+ let mut pkg = match src.download(package)? {
+ MaybePackage::Ready(pkg) => pkg,
+ MaybePackage::Download { .. } => unreachable!(),
+ };
+
+ // After we've loaded the package configure its summary's `checksum`
+ // field with the checksum we know for this `PackageId`.
+ let req = OptVersionReq::exact(package.version());
+ let summary_with_cksum = self
+ .index
+ .summaries(package.name(), &req, &mut *self.ops)?
+ .expect("a downloaded dep now pending!?")
+ .map(|s| s.summary.clone())
+ .next()
+ .expect("summary not found");
+ if let Some(cksum) = summary_with_cksum.checksum() {
+ pkg.manifest_mut()
+ .summary_mut()
+ .set_checksum(cksum.to_string());
+ }
+
+ Ok(pkg)
+ }
+}
+
+impl<'cfg> Source for RegistrySource<'cfg> {
+ fn query(
+ &mut self,
+ dep: &Dependency,
+ kind: QueryKind,
+ f: &mut dyn FnMut(Summary),
+ ) -> Poll<CargoResult<()>> {
+ // If this is a precise dependency, then it came from a lock file and in
+ // theory the registry is known to contain this version. If, however, we
+ // come back with no summaries, then our registry may need to be
+ // updated, so we fall back to performing a lazy update.
+ if kind == QueryKind::Exact && dep.source_id().precise().is_some() && !self.ops.is_updated()
+ {
+ debug!("attempting query without update");
+ let mut called = false;
+ let pend =
+ self.index
+ .query_inner(dep, &mut *self.ops, &self.yanked_whitelist, &mut |s| {
+ if dep.matches(&s) {
+ called = true;
+ f(s);
+ }
+ })?;
+ if pend.is_pending() {
+ return Poll::Pending;
+ }
+ if called {
+ return Poll::Ready(Ok(()));
+ } else {
+ debug!("falling back to an update");
+ self.invalidate_cache();
+ return Poll::Pending;
+ }
+ }
+
+ self.index
+ .query_inner(dep, &mut *self.ops, &self.yanked_whitelist, &mut |s| {
+ let matched = match kind {
+ QueryKind::Exact => dep.matches(&s),
+ QueryKind::Fuzzy => true,
+ };
+ if matched {
+ f(s);
+ }
+ })
+ }
+
+ fn supports_checksums(&self) -> bool {
+ true
+ }
+
+ fn requires_precise(&self) -> bool {
+ false
+ }
+
+ fn source_id(&self) -> SourceId {
+ self.source_id
+ }
+
+ fn invalidate_cache(&mut self) {
+ self.index.clear_summaries_cache();
+ self.ops.invalidate_cache();
+ }
+
+ fn download(&mut self, package: PackageId) -> CargoResult<MaybePackage> {
+ let hash = loop {
+ match self.index.hash(package, &mut *self.ops)? {
+ Poll::Pending => self.block_until_ready()?,
+ Poll::Ready(hash) => break hash,
+ }
+ };
+ match self.ops.download(package, hash)? {
+ MaybeLock::Ready(file) => self.get_pkg(package, &file).map(MaybePackage::Ready),
+ MaybeLock::Download {
+ url,
+ descriptor,
+ authorization,
+ } => Ok(MaybePackage::Download {
+ url,
+ descriptor,
+ authorization,
+ }),
+ }
+ }
+
+ fn finish_download(&mut self, package: PackageId, data: Vec<u8>) -> CargoResult<Package> {
+ let hash = loop {
+ match self.index.hash(package, &mut *self.ops)? {
+ Poll::Pending => self.block_until_ready()?,
+ Poll::Ready(hash) => break hash,
+ }
+ };
+ let file = self.ops.finish_download(package, hash, &data)?;
+ self.get_pkg(package, &file)
+ }
+
+ fn fingerprint(&self, pkg: &Package) -> CargoResult<String> {
+ Ok(pkg.package_id().version().to_string())
+ }
+
+ fn describe(&self) -> String {
+ self.source_id.display_index()
+ }
+
+ fn add_to_yanked_whitelist(&mut self, pkgs: &[PackageId]) {
+ self.yanked_whitelist.extend(pkgs);
+ }
+
+ fn is_yanked(&mut self, pkg: PackageId) -> Poll<CargoResult<bool>> {
+ self.index.is_yanked(pkg, &mut *self.ops)
+ }
+
+ fn block_until_ready(&mut self) -> CargoResult<()> {
+ // Before starting to work on the registry, make sure that
+ // `<cargo_home>/registry` is marked as excluded from indexing and
+ // backups. Older versions of Cargo didn't do this, so we do it here
+ // regardless of whether `<cargo_home>` exists.
+ //
+ // This does not use `create_dir_all_excluded_from_backups_atomic` for
+ // the same reason: we want to exclude it even if the directory already
+ // exists.
+ //
+ // IO errors in creating and marking it are ignored, e.g. in case we're on a
+ // read-only filesystem.
+ let registry_base = self.config.registry_base_path();
+ let _ = registry_base.create_dir();
+ exclude_from_backups_and_indexing(&registry_base.into_path_unlocked());
+
+ self.ops.block_until_ready()
+ }
+}
+
+/// Gets the maximum unpack size that Cargo permits
+/// based on a given `size` of your compressed file.
+///
+/// Returns the larger of `size * max_compression_ratio`
+/// and a fixed maximum unpacked size.
+///
+/// In reality, the compression ratio usually falls in the range of 2:1 to 10:1.
+/// We choose 20:1 to hopefully cover almost all possible cases.
+/// Any ratio higher than this is considered a zip bomb.
+///
+/// In the future we might want to introduce a configurable size.
+///
+/// Some of the real world data from common compression algorithms:
+///
+/// * <https://www.zlib.net/zlib_tech.html>
+/// * <https://cran.r-project.org/web/packages/brotli/vignettes/brotli-2015-09-22.pdf>
+/// * <https://blog.cloudflare.com/results-experimenting-brotli/>
+/// * <https://tukaani.org/lzma/benchmarks.html>
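+///
+/// For example, a 1 MiB compressed crate may unpack to at most
+/// `max(512 MiB, 1 MiB * 20)` = 512 MiB, while a 100 MiB one may unpack to
+/// `max(512 MiB, 100 MiB * 20)` = 2000 MiB.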
+fn max_unpack_size(config: &Config, size: u64) -> u64 {
+ const SIZE_VAR: &str = "__CARGO_TEST_MAX_UNPACK_SIZE";
+ const RATIO_VAR: &str = "__CARGO_TEST_MAX_UNPACK_RATIO";
+ let max_unpack_size = if cfg!(debug_assertions) && config.get_env(SIZE_VAR).is_ok() {
+ // For integration test only.
+ config
+ .get_env(SIZE_VAR)
+ .unwrap()
+ .parse()
+ .expect("a max unpack size in bytes")
+ } else {
+ MAX_UNPACK_SIZE
+ };
+ let max_compression_ratio = if cfg!(debug_assertions) && config.get_env(RATIO_VAR).is_ok() {
+ // For integration test only.
+ config
+ .get_env(RATIO_VAR)
+ .unwrap()
+ .parse()
+ .expect("a max compression ratio in bytes")
+ } else {
+ MAX_COMPRESSION_RATIO
+ };
+
+ u64::max(max_unpack_size, size * max_compression_ratio as u64)
+}
+
+fn make_dep_prefix(name: &str) -> String {
+ match name.len() {
+ 1 => String::from("1"),
+ 2 => String::from("2"),
+ 3 => format!("3/{}", &name[..1]),
+ _ => format!("{}/{}", &name[0..2], &name[2..4]),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::make_dep_prefix;
+
+ #[test]
+ fn dep_prefix() {
+ assert_eq!(make_dep_prefix("a"), "1");
+ assert_eq!(make_dep_prefix("ab"), "2");
+ assert_eq!(make_dep_prefix("abc"), "3/a");
+ assert_eq!(make_dep_prefix("Abc"), "3/A");
+ assert_eq!(make_dep_prefix("AbCd"), "Ab/Cd");
+ assert_eq!(make_dep_prefix("aBcDe"), "aB/cD");
+ }
+}
diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs
new file mode 100644
index 0000000..aa0ec90
--- /dev/null
+++ b/src/cargo/sources/registry/remote.rs
@@ -0,0 +1,358 @@
+use crate::core::{GitReference, PackageId, SourceId};
+use crate::sources::git;
+use crate::sources::registry::download;
+use crate::sources::registry::MaybeLock;
+use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData};
+use crate::util::errors::CargoResult;
+use crate::util::interning::InternedString;
+use crate::util::{Config, Filesystem};
+use anyhow::Context as _;
+use cargo_util::paths;
+use lazycell::LazyCell;
+use log::{debug, trace};
+use std::cell::{Cell, Ref, RefCell};
+use std::fs::File;
+use std::mem;
+use std::path::Path;
+use std::str;
+use std::task::{ready, Poll};
+
+/// A remote registry is a registry that lives at a remote URL (such as
+/// crates.io). The git index is cloned locally, and `.crate` files are
+/// downloaded as needed and cached locally.
+pub struct RemoteRegistry<'cfg> {
+ index_path: Filesystem,
+ /// Path to the cache of `.crate` files (`$CARGO_HOME/registry/path/$REG-HASH`).
+ cache_path: Filesystem,
+ source_id: SourceId,
+ index_git_ref: GitReference,
+ config: &'cfg Config,
+ tree: RefCell<Option<git2::Tree<'static>>>,
+ repo: LazyCell<git2::Repository>,
+ head: Cell<Option<git2::Oid>>,
+ current_sha: Cell<Option<InternedString>>,
+ needs_update: bool, // Does this registry need to be updated?
+}
+
+impl<'cfg> RemoteRegistry<'cfg> {
+ pub fn new(source_id: SourceId, config: &'cfg Config, name: &str) -> RemoteRegistry<'cfg> {
+ RemoteRegistry {
+ index_path: config.registry_index_path().join(name),
+ cache_path: config.registry_cache_path().join(name),
+ source_id,
+ config,
+ // TODO: we should probably make this configurable
+ index_git_ref: GitReference::DefaultBranch,
+ tree: RefCell::new(None),
+ repo: LazyCell::new(),
+ head: Cell::new(None),
+ current_sha: Cell::new(None),
+ needs_update: false,
+ }
+ }
+
+ fn repo(&self) -> CargoResult<&git2::Repository> {
+ self.repo.try_borrow_with(|| {
+ let path = self.config.assert_package_cache_locked(&self.index_path);
+
+ // Fast path without a lock
+ if let Ok(repo) = git2::Repository::open(&path) {
+ trace!("opened a repo without a lock");
+ return Ok(repo);
+ }
+
+ // Ok, now we need to lock and try the whole thing over again.
+ trace!("acquiring registry index lock");
+ match git2::Repository::open(&path) {
+ Ok(repo) => Ok(repo),
+ Err(_) => {
+ drop(paths::remove_dir_all(&path));
+ paths::create_dir_all(&path)?;
+
+ // Note that we'd actually prefer to use a bare repository
+ // here as we're not actually going to check anything out.
+ // All versions of Cargo, though, share the same CARGO_HOME,
+ // so for compatibility with older Cargo which *does* do
+ // checkouts we make sure to initialize a new full
+ // repository (not a bare one).
+ //
+ // We should change this to `init_bare` whenever we feel
+ // like enough time has passed or if we change the directory
+ // that the folder is located in, such as by changing the
+ // hash at the end of the directory.
+ //
+ // Note that in the meantime we also skip `init.templatedir`
+ // as it can be misconfigured sometimes or otherwise add
+ // things that we don't want.
+ let mut opts = git2::RepositoryInitOptions::new();
+ opts.external_template(false);
+ Ok(git2::Repository::init_opts(&path, &opts).with_context(|| {
+ format!("failed to initialize index git repository (in {:?})", path)
+ })?)
+ }
+ }
+ })
+ }
+
+ fn head(&self) -> CargoResult<git2::Oid> {
+ if self.head.get().is_none() {
+ let repo = self.repo()?;
+ let oid = self.index_git_ref.resolve(repo)?;
+ self.head.set(Some(oid));
+ }
+ Ok(self.head.get().unwrap())
+ }
+
+ fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> {
+ {
+ let tree = self.tree.borrow();
+ if tree.is_some() {
+ return Ok(Ref::map(tree, |s| s.as_ref().unwrap()));
+ }
+ }
+ let repo = self.repo()?;
+ let commit = repo.find_commit(self.head()?)?;
+ let tree = commit.tree()?;
+
+ // Unfortunately in libgit2 the tree objects look like they've got a
+ // reference to the repository object which means that a tree cannot
+ // outlive the repository that it came from. Here we want to cache this
+ // tree, though, so to accomplish this we transmute it to a static
+ // lifetime.
+ //
+ // Note that we don't actually hand out the static lifetime, instead we
+ // only return a scoped one from this function. Additionally the repo
+ // we loaded from (above) lives as long as this object
+ // (`RemoteRegistry`) so we then just need to ensure that the tree is
+ // destroyed first in the destructor, hence the destructor on
+ // `RemoteRegistry` below.
+ let tree = unsafe { mem::transmute::<git2::Tree<'_>, git2::Tree<'static>>(tree) };
+ *self.tree.borrow_mut() = Some(tree);
+ Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap()))
+ }
+
+ fn current_version(&self) -> Option<InternedString> {
+ if let Some(sha) = self.current_sha.get() {
+ return Some(sha);
+ }
+ let sha = InternedString::new(&self.head().ok()?.to_string());
+ self.current_sha.set(Some(sha));
+ Some(sha)
+ }
+
+ fn is_updated(&self) -> bool {
+ self.config.updated_sources().contains(&self.source_id)
+ }
+
+ fn mark_updated(&self) {
+ self.config.updated_sources().insert(self.source_id);
+ }
+}
+
+const LAST_UPDATED_FILE: &str = ".last-updated";
+
+impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
+ fn prepare(&self) -> CargoResult<()> {
+ self.repo()?; // create intermediate dirs and initialize the repo
+ Ok(())
+ }
+
+ fn index_path(&self) -> &Filesystem {
+ &self.index_path
+ }
+
+ fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path {
+ self.config.assert_package_cache_locked(path)
+ }
+
+    // `index_version` is a string representing the version of the file used to construct the cached copy.
+    // Older versions of Cargo used the single value of the hash of the HEAD commit as an `index_version`.
+    // This is technically correct but a little too conservative: if a new commit is fetched, all cached
+    // files need to be regenerated even if a particular file was not changed.
+    // However, if an old cargo has written such a file we still know how to read it, as long as we check for that hash value.
+ //
+ // Cargo now uses a hash of the file's contents as provided by git.
+ fn load(
+ &mut self,
+ _root: &Path,
+ path: &Path,
+ index_version: Option<&str>,
+ ) -> Poll<CargoResult<LoadResponse>> {
+ if self.needs_update {
+ return Poll::Pending;
+ }
+ // Check if the cache is valid.
+ let git_commit_hash = self.current_version();
+ if index_version.is_some() && index_version == git_commit_hash.as_deref() {
+ // This file was written by an old version of cargo, but it is still up-to-date.
+ return Poll::Ready(Ok(LoadResponse::CacheValid));
+ }
+ // Note that the index calls this method and the filesystem is locked
+ // in the index, so we don't need to worry about an `update_index`
+ // happening in a different process.
+ fn load_helper(
+ registry: &RemoteRegistry<'_>,
+ path: &Path,
+ index_version: Option<&str>,
+ ) -> CargoResult<LoadResponse> {
+ let repo = registry.repo()?;
+ let tree = registry.tree()?;
+ let entry = tree.get_path(path);
+ let entry = entry?;
+ let git_file_hash = Some(entry.id().to_string());
+
+ // Check if the cache is valid.
+ if index_version.is_some() && index_version == git_file_hash.as_deref() {
+ return Ok(LoadResponse::CacheValid);
+ }
+
+ let object = entry.to_object(repo)?;
+ let blob = match object.as_blob() {
+ Some(blob) => blob,
+ None => anyhow::bail!("path `{}` is not a blob in the git repo", path.display()),
+ };
+
+ Ok(LoadResponse::Data {
+ raw_data: blob.content().to_vec(),
+ index_version: git_file_hash,
+ })
+ }
+
+ match load_helper(&self, path, index_version) {
+ Ok(result) => Poll::Ready(Ok(result)),
+ Err(_) if !self.is_updated() => {
+ // If git returns an error and we haven't updated the repo, return
+ // pending to allow an update to try again.
+ self.needs_update = true;
+ Poll::Pending
+ }
+ Err(e)
+ if e.downcast_ref::<git2::Error>()
+ .map(|e| e.code() == git2::ErrorCode::NotFound)
+ .unwrap_or_default() =>
+ {
+ // The repo has been updated and the file does not exist.
+ Poll::Ready(Ok(LoadResponse::NotFound))
+ }
+ Err(e) => Poll::Ready(Err(e)),
+ }
+ }
+
+ fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> {
+ debug!("loading config");
+ self.prepare()?;
+ self.config.assert_package_cache_locked(&self.index_path);
+ match ready!(self.load(Path::new(""), Path::new("config.json"), None)?) {
+ LoadResponse::Data { raw_data, .. } => {
+ trace!("config loaded");
+ let mut cfg: RegistryConfig = serde_json::from_slice(&raw_data)?;
+ if !self.config.cli_unstable().registry_auth {
+ cfg.auth_required = false;
+ }
+ Poll::Ready(Ok(Some(cfg)))
+ }
+ _ => Poll::Ready(Ok(None)),
+ }
+ }
+
+ fn block_until_ready(&mut self) -> CargoResult<()> {
+ if !self.needs_update {
+ return Ok(());
+ }
+
+ self.needs_update = false;
+
+ // Make sure the index is only updated once per session since it is an
+ // expensive operation. This generally only happens when the resolver
+ // is run multiple times, such as during `cargo publish`.
+ if self.is_updated() {
+ return Ok(());
+ }
+ self.mark_updated();
+
+ if self.config.offline() {
+ return Ok(());
+ }
+ if self.config.cli_unstable().no_index_update {
+ return Ok(());
+ }
+
+ debug!("updating the index");
+
+ // Ensure that we'll actually be able to acquire an HTTP handle later on
+ // once we start trying to download crates. This will weed out any
+ // problems with `.cargo/config` configuration related to HTTP.
+ //
+ // This way if there's a problem the error gets printed before we even
+ // hit the index, which may not actually read this configuration.
+ self.config.http()?;
+
+ self.prepare()?;
+ self.head.set(None);
+ *self.tree.borrow_mut() = None;
+ self.current_sha.set(None);
+ let path = self.config.assert_package_cache_locked(&self.index_path);
+ self.config
+ .shell()
+ .status("Updating", self.source_id.display_index())?;
+
+ // Fetch the latest version of our `index_git_ref` into the index
+ // checkout.
+ let url = self.source_id.url();
+ let repo = self.repo.borrow_mut().unwrap();
+ git::fetch(repo, url.as_str(), &self.index_git_ref, self.config)
+ .with_context(|| format!("failed to fetch `{}`", url))?;
+
+ // Create a dummy file to record the mtime for when we updated the
+ // index.
+ paths::create(&path.join(LAST_UPDATED_FILE))?;
+
+ Ok(())
+ }
+
+ fn invalidate_cache(&mut self) {
+        // To fully invalidate, undo `mark_updated`'s work
+ self.needs_update = true;
+ }
+
+ fn is_updated(&self) -> bool {
+ self.is_updated()
+ }
+
+ fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> {
+ let registry_config = loop {
+ match self.config()? {
+ Poll::Pending => self.block_until_ready()?,
+ Poll::Ready(cfg) => break cfg.unwrap(),
+ }
+ };
+
+ download::download(
+ &self.cache_path,
+ &self.config,
+ pkg,
+ checksum,
+ registry_config,
+ )
+ }
+
+ fn finish_download(
+ &mut self,
+ pkg: PackageId,
+ checksum: &str,
+ data: &[u8],
+ ) -> CargoResult<File> {
+ download::finish_download(&self.cache_path, &self.config, pkg, checksum, data)
+ }
+
+ fn is_crate_downloaded(&self, pkg: PackageId) -> bool {
+ download::is_crate_downloaded(&self.cache_path, &self.config, pkg)
+ }
+}
+
+impl<'cfg> Drop for RemoteRegistry<'cfg> {
+ fn drop(&mut self) {
+ // Just be sure to drop this before our other fields
+ self.tree.borrow_mut().take();
+ }
+}