Diffstat
-rw-r--r-- | src/cargo/sources/registry/download.rs    | 130
-rw-r--r-- | src/cargo/sources/registry/http_remote.rs | 802
-rw-r--r-- | src/cargo/sources/registry/index.rs       | 889
-rw-r--r-- | src/cargo/sources/registry/local.rs       | 149
-rw-r--r-- | src/cargo/sources/registry/mod.rs         | 970
-rw-r--r-- | src/cargo/sources/registry/remote.rs      | 358
6 files changed, 3298 insertions, 0 deletions
diff --git a/src/cargo/sources/registry/download.rs b/src/cargo/sources/registry/download.rs new file mode 100644 index 0000000..723c55f --- /dev/null +++ b/src/cargo/sources/registry/download.rs @@ -0,0 +1,130 @@ +use anyhow::Context; +use cargo_util::Sha256; + +use crate::core::PackageId; +use crate::sources::registry::make_dep_prefix; +use crate::sources::registry::MaybeLock; +use crate::sources::registry::{ + RegistryConfig, CHECKSUM_TEMPLATE, CRATE_TEMPLATE, LOWER_PREFIX_TEMPLATE, PREFIX_TEMPLATE, + VERSION_TEMPLATE, +}; +use crate::util::auth; +use crate::util::errors::CargoResult; +use crate::util::{Config, Filesystem}; +use std::fmt::Write as FmtWrite; +use std::fs::{self, File, OpenOptions}; +use std::io::prelude::*; +use std::io::SeekFrom; +use std::str; + +pub(super) fn filename(pkg: PackageId) -> String { + format!("{}-{}.crate", pkg.name(), pkg.version()) +} + +pub(super) fn download( + cache_path: &Filesystem, + config: &Config, + pkg: PackageId, + checksum: &str, + registry_config: RegistryConfig, +) -> CargoResult<MaybeLock> { + let filename = filename(pkg); + let path = cache_path.join(&filename); + let path = config.assert_package_cache_locked(&path); + + // Attempt to open a read-only copy first to avoid an exclusive write + // lock and also work with read-only filesystems. Note that we check the + // length of the file like below to handle interrupted downloads. + // + // If this fails then we fall through to the exclusive path where we may + // have to redownload the file. + if let Ok(dst) = File::open(path) { + let meta = dst.metadata()?; + if meta.len() > 0 { + return Ok(MaybeLock::Ready(dst)); + } + } + + let mut url = registry_config.dl; + if !url.contains(CRATE_TEMPLATE) + && !url.contains(VERSION_TEMPLATE) + && !url.contains(PREFIX_TEMPLATE) + && !url.contains(LOWER_PREFIX_TEMPLATE) + && !url.contains(CHECKSUM_TEMPLATE) + { + // Original format before customizing the download URL was supported. + write!( + url, + "/{}/{}/download", + pkg.name(), + pkg.version().to_string() + ) + .unwrap(); + } else { + let prefix = make_dep_prefix(&*pkg.name()); + url = url + .replace(CRATE_TEMPLATE, &*pkg.name()) + .replace(VERSION_TEMPLATE, &pkg.version().to_string()) + .replace(PREFIX_TEMPLATE, &prefix) + .replace(LOWER_PREFIX_TEMPLATE, &prefix.to_lowercase()) + .replace(CHECKSUM_TEMPLATE, checksum); + } + + let authorization = if registry_config.auth_required { + Some(auth::auth_token(config, &pkg.source_id(), None, None)?) 
+ } else { + None + }; + + Ok(MaybeLock::Download { + url, + descriptor: pkg.to_string(), + authorization: authorization, + }) +} + +pub(super) fn finish_download( + cache_path: &Filesystem, + config: &Config, + pkg: PackageId, + checksum: &str, + data: &[u8], +) -> CargoResult<File> { + // Verify what we just downloaded + let actual = Sha256::new().update(data).finish_hex(); + if actual != checksum { + anyhow::bail!("failed to verify the checksum of `{}`", pkg) + } + + let filename = filename(pkg); + cache_path.create_dir()?; + let path = cache_path.join(&filename); + let path = config.assert_package_cache_locked(&path); + let mut dst = OpenOptions::new() + .create(true) + .read(true) + .write(true) + .open(&path) + .with_context(|| format!("failed to open `{}`", path.display()))?; + let meta = dst.metadata()?; + if meta.len() > 0 { + return Ok(dst); + } + + dst.write_all(data)?; + dst.seek(SeekFrom::Start(0))?; + Ok(dst) +} + +pub(super) fn is_crate_downloaded( + cache_path: &Filesystem, + config: &Config, + pkg: PackageId, +) -> bool { + let path = cache_path.join(filename(pkg)); + let path = config.assert_package_cache_locked(&path); + if let Ok(meta) = fs::metadata(path) { + return meta.len() > 0; + } + false +} diff --git a/src/cargo/sources/registry/http_remote.rs b/src/cargo/sources/registry/http_remote.rs new file mode 100644 index 0000000..c3bcadf --- /dev/null +++ b/src/cargo/sources/registry/http_remote.rs @@ -0,0 +1,802 @@ +//! Access to a HTTP-based crate registry. +//! +//! See [`HttpRegistry`] for details. + +use crate::core::{PackageId, SourceId}; +use crate::ops::{self}; +use crate::sources::registry::download; +use crate::sources::registry::MaybeLock; +use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData}; +use crate::util::errors::{CargoResult, HttpNotSuccessful}; +use crate::util::network::Retry; +use crate::util::{auth, Config, Filesystem, IntoUrl, Progress, ProgressStyle}; +use anyhow::Context; +use cargo_util::paths; +use curl::easy::{HttpVersion, List}; +use curl::multi::{EasyHandle, Multi}; +use log::{debug, trace, warn}; +use std::cell::RefCell; +use std::collections::{HashMap, HashSet}; +use std::fs::{self, File}; +use std::io::ErrorKind; +use std::path::{Path, PathBuf}; +use std::str; +use std::task::{ready, Poll}; +use std::time::Duration; +use url::Url; + +// HTTP headers +const ETAG: &'static str = "etag"; +const LAST_MODIFIED: &'static str = "last-modified"; +const WWW_AUTHENTICATE: &'static str = "www-authenticate"; +const IF_NONE_MATCH: &'static str = "if-none-match"; +const IF_MODIFIED_SINCE: &'static str = "if-modified-since"; + +const UNKNOWN: &'static str = "Unknown"; + +/// A registry served by the HTTP-based registry API. +/// +/// This type is primarily accessed through the [`RegistryData`] trait. +/// +/// `HttpRegistry` implements the HTTP-based registry API outlined in [RFC 2789]. Read the RFC for +/// the complete protocol, but _roughly_ the implementation loads each index file (e.g., +/// config.json or re/ge/regex) from an HTTP service rather than from a locally cloned git +/// repository. The remote service can more or less be a static file server that simply serves the +/// contents of the origin git repository. +/// +/// Implemented naively, this leads to a significant amount of network traffic, as a lookup of any +/// index file would need to check with the remote backend if the index file has changed. 
This +/// cost is somewhat mitigated by the use of HTTP conditional fetches (`If-Modified-Since` and +/// `If-None-Match` for `ETag`s) which can be efficiently handled by HTTP/2. +/// +/// [RFC 2789]: https://github.com/rust-lang/rfcs/pull/2789 +pub struct HttpRegistry<'cfg> { + index_path: Filesystem, + cache_path: Filesystem, + source_id: SourceId, + config: &'cfg Config, + + /// Store the server URL without the protocol prefix (sparse+) + url: Url, + + /// HTTP multi-handle for asynchronous/parallel requests. + multi: Multi, + + /// Has the client requested a cache update? + /// + /// Only if they have do we double-check the freshness of each locally-stored index file. + requested_update: bool, + + /// State for currently pending index downloads. + downloads: Downloads<'cfg>, + + /// Does the config say that we can use HTTP multiplexing? + multiplexing: bool, + + /// What paths have we already fetched since the last index update? + /// + /// We do not need to double-check any of these index files since we have already done so. + fresh: HashSet<PathBuf>, + + /// Have we started to download any index files? + fetch_started: bool, + + /// Cached registry configuration. + registry_config: Option<RegistryConfig>, + + /// Should we include the authorization header? + auth_required: bool, + + /// Url to get a token for the registry. + login_url: Option<Url>, +} + +/// Helper for downloading crates. +pub struct Downloads<'cfg> { + /// When a download is started, it is added to this map. The key is a + /// "token" (see `Download::token`). It is removed once the download is + /// finished. + pending: HashMap<usize, (Download<'cfg>, EasyHandle)>, + /// Set of paths currently being downloaded. + /// This should stay in sync with `pending`. + pending_paths: HashSet<PathBuf>, + /// The final result of each download. + results: HashMap<PathBuf, CargoResult<CompletedDownload>>, + /// The next ID to use for creating a token (see `Download::token`). + next: usize, + /// Progress bar. + progress: RefCell<Option<Progress<'cfg>>>, + /// Number of downloads that have successfully finished. + downloads_finished: usize, + /// Number of times the caller has requested blocking. This is used for + /// an estimate of progress. + blocking_calls: usize, +} + +struct Download<'cfg> { + /// The token for this download, used as the key of the `Downloads::pending` map + /// and stored in `EasyHandle` as well. + token: usize, + + /// The path of the package that we're downloading. + path: PathBuf, + + /// Actual downloaded data, updated throughout the lifetime of this download. + data: RefCell<Vec<u8>>, + + /// HTTP headers. + header_map: RefCell<Headers>, + + /// Logic used to track retrying this download if it's a spurious failure. + retry: Retry<'cfg>, +} + +#[derive(Default)] +struct Headers { + last_modified: Option<String>, + etag: Option<String>, + www_authenticate: Vec<String>, +} + +enum StatusCode { + Success, + NotModified, + NotFound, + Unauthorized, +} + +struct CompletedDownload { + response_code: StatusCode, + data: Vec<u8>, + header_map: Headers, +} + +impl<'cfg> HttpRegistry<'cfg> { + pub fn new( + source_id: SourceId, + config: &'cfg Config, + name: &str, + ) -> CargoResult<HttpRegistry<'cfg>> { + let url = source_id.url().as_str(); + // Ensure the url ends with a slash so we can concatenate paths. 
+ if !url.ends_with('/') { + anyhow::bail!("sparse registry url must end in a slash `/`: {url}") + } + assert!(source_id.is_sparse()); + let url = url + .strip_prefix("sparse+") + .expect("sparse registry needs sparse+ prefix") + .into_url() + .expect("a url with the sparse+ stripped should still be valid"); + + Ok(HttpRegistry { + index_path: config.registry_index_path().join(name), + cache_path: config.registry_cache_path().join(name), + source_id, + config, + url, + multi: Multi::new(), + multiplexing: false, + downloads: Downloads { + next: 0, + pending: HashMap::new(), + pending_paths: HashSet::new(), + results: HashMap::new(), + progress: RefCell::new(Some(Progress::with_style( + "Fetch", + ProgressStyle::Indeterminate, + config, + ))), + downloads_finished: 0, + blocking_calls: 0, + }, + fresh: HashSet::new(), + requested_update: false, + fetch_started: false, + registry_config: None, + auth_required: false, + login_url: None, + }) + } + + fn handle_http_header(buf: &[u8]) -> Option<(&str, &str)> { + if buf.is_empty() { + return None; + } + let buf = std::str::from_utf8(buf).ok()?.trim_end(); + // Don't let server sneak extra lines anywhere. + if buf.contains('\n') { + return None; + } + let (tag, value) = buf.split_once(':')?; + let value = value.trim(); + Some((tag, value)) + } + + fn start_fetch(&mut self) -> CargoResult<()> { + if self.fetch_started { + // We only need to run the setup code once. + return Ok(()); + } + self.fetch_started = true; + + // We've enabled the `http2` feature of `curl` in Cargo, so treat + // failures here as fatal as it would indicate a build-time problem. + self.multiplexing = self.config.http_config()?.multiplexing.unwrap_or(true); + + self.multi + .pipelining(false, self.multiplexing) + .with_context(|| "failed to enable multiplexing/pipelining in curl")?; + + // let's not flood the server with connections + self.multi.set_max_host_connections(2)?; + + self.config + .shell() + .status("Updating", self.source_id.display_index())?; + + Ok(()) + } + + fn handle_completed_downloads(&mut self) -> CargoResult<()> { + assert_eq!( + self.downloads.pending.len(), + self.downloads.pending_paths.len() + ); + + // Collect the results from the Multi handle. 
+ let results = { + let mut results = Vec::new(); + let pending = &mut self.downloads.pending; + self.multi.messages(|msg| { + let token = msg.token().expect("failed to read token"); + let (_, handle) = &pending[&token]; + if let Some(result) = msg.result_for(handle) { + results.push((token, result)); + }; + }); + results + }; + for (token, result) in results { + let (mut download, handle) = self.downloads.pending.remove(&token).unwrap(); + let mut handle = self.multi.remove(handle)?; + let data = download.data.take(); + let url = self.full_url(&download.path); + let result = match download.retry.r#try(|| { + result.with_context(|| format!("failed to download from `{}`", url))?; + let code = handle.response_code()?; + // Keep this list of expected status codes in sync with the codes handled in `load` + let code = match code { + 200 => StatusCode::Success, + 304 => StatusCode::NotModified, + 401 => StatusCode::Unauthorized, + 404 | 410 | 451 => StatusCode::NotFound, + code => { + let url = handle.effective_url()?.unwrap_or(&url); + return Err(HttpNotSuccessful { + code, + url: url.to_owned(), + body: data, + } + .into()); + } + }; + Ok((data, code)) + }) { + Ok(Some((data, code))) => Ok(CompletedDownload { + response_code: code, + data, + header_map: download.header_map.take(), + }), + Ok(None) => { + // retry the operation + let handle = self.multi.add(handle)?; + self.downloads.pending.insert(token, (download, handle)); + continue; + } + Err(e) => Err(e), + }; + + assert!(self.downloads.pending_paths.remove(&download.path)); + self.downloads.results.insert(download.path, result); + self.downloads.downloads_finished += 1; + } + + self.downloads.tick()?; + + Ok(()) + } + + fn full_url(&self, path: &Path) -> String { + // self.url always ends with a slash. + format!("{}{}", self.url, path.display()) + } + + fn is_fresh(&self, path: &Path) -> bool { + if !self.requested_update { + trace!( + "using local {} as user did not request update", + path.display() + ); + true + } else if self.config.cli_unstable().no_index_update { + trace!("using local {} in no_index_update mode", path.display()); + true + } else if self.config.offline() { + trace!("using local {} in offline mode", path.display()); + true + } else if self.fresh.contains(path) { + trace!("using local {} as it was already fetched", path.display()); + true + } else { + debug!("checking freshness of {}", path.display()); + false + } + } + + /// Get the cached registry configuration, if it exists. + fn config_cached(&mut self) -> CargoResult<Option<&RegistryConfig>> { + if self.registry_config.is_some() { + return Ok(self.registry_config.as_ref()); + } + let config_json_path = self + .assert_index_locked(&self.index_path) + .join("config.json"); + match fs::read(&config_json_path) { + Ok(raw_data) => match serde_json::from_slice(&raw_data) { + Ok(json) => { + self.registry_config = Some(json); + } + Err(e) => log::debug!("failed to decode cached config.json: {}", e), + }, + Err(e) => { + if e.kind() != ErrorKind::NotFound { + log::debug!("failed to read config.json cache: {}", e) + } + } + } + Ok(self.registry_config.as_ref()) + } + + /// Get the registry configuration. 
+ fn config(&mut self) -> Poll<CargoResult<&RegistryConfig>> { + debug!("loading config"); + let index_path = self.assert_index_locked(&self.index_path); + let config_json_path = index_path.join("config.json"); + if self.is_fresh(Path::new("config.json")) && self.config_cached()?.is_some() { + return Poll::Ready(Ok(self.registry_config.as_ref().unwrap())); + } + + match ready!(self.load(Path::new(""), Path::new("config.json"), None)?) { + LoadResponse::Data { + raw_data, + index_version: _, + } => { + trace!("config loaded"); + self.registry_config = Some(serde_json::from_slice(&raw_data)?); + if paths::create_dir_all(&config_json_path.parent().unwrap()).is_ok() { + if let Err(e) = fs::write(&config_json_path, &raw_data) { + log::debug!("failed to write config.json cache: {}", e); + } + } + Poll::Ready(Ok(self.registry_config.as_ref().unwrap())) + } + LoadResponse::NotFound => { + Poll::Ready(Err(anyhow::anyhow!("config.json not found in registry"))) + } + LoadResponse::CacheValid => Poll::Ready(Err(crate::util::internal( + "config.json is never stored in the index cache", + ))), + } + } +} + +impl<'cfg> RegistryData for HttpRegistry<'cfg> { + fn prepare(&self) -> CargoResult<()> { + Ok(()) + } + + fn index_path(&self) -> &Filesystem { + &self.index_path + } + + fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path { + self.config.assert_package_cache_locked(path) + } + + fn is_updated(&self) -> bool { + self.requested_update + } + + fn load( + &mut self, + _root: &Path, + path: &Path, + index_version: Option<&str>, + ) -> Poll<CargoResult<LoadResponse>> { + trace!("load: {}", path.display()); + if let Some(_token) = self.downloads.pending_paths.get(path) { + debug!("dependency is still pending: {}", path.display()); + return Poll::Pending; + } + + if let Some(index_version) = index_version { + trace!( + "local cache of {} is available at version `{}`", + path.display(), + index_version + ); + if self.is_fresh(path) { + return Poll::Ready(Ok(LoadResponse::CacheValid)); + } + } else if self.fresh.contains(path) { + // We have no cached copy of this file, and we already downloaded it. + debug!( + "cache did not contain previously downloaded file {}", + path.display() + ); + return Poll::Ready(Ok(LoadResponse::NotFound)); + } + + if let Some(result) = self.downloads.results.remove(path) { + let result = + result.with_context(|| format!("download of {} failed", path.display()))?; + + assert!( + self.fresh.insert(path.to_path_buf()), + "downloaded the index file `{}` twice", + path.display() + ); + + // The status handled here need to be kept in sync with the codes handled + // in `handle_completed_downloads` + match result.response_code { + StatusCode::Success => { + let response_index_version = if let Some(etag) = result.header_map.etag { + format!("{}: {}", ETAG, etag) + } else if let Some(lm) = result.header_map.last_modified { + format!("{}: {}", LAST_MODIFIED, lm) + } else { + UNKNOWN.to_string() + }; + trace!("index file version: {}", response_index_version); + return Poll::Ready(Ok(LoadResponse::Data { + raw_data: result.data, + index_version: Some(response_index_version), + })); + } + StatusCode::NotModified => { + // Not Modified: the data in the cache is still the latest. + if index_version.is_none() { + return Poll::Ready(Err(anyhow::anyhow!( + "server said not modified (HTTP 304) when no local cache exists" + ))); + } + return Poll::Ready(Ok(LoadResponse::CacheValid)); + } + StatusCode::NotFound => { + // The crate was not found or deleted from the registry. 
+ return Poll::Ready(Ok(LoadResponse::NotFound)); + } + StatusCode::Unauthorized + if !self.auth_required + && path == Path::new("config.json") + && self.config.cli_unstable().registry_auth => + { + debug!("re-attempting request for config.json with authorization included."); + self.fresh.remove(path); + self.auth_required = true; + + // Look for a `www-authenticate` header with the `Cargo` scheme. + for header in &result.header_map.www_authenticate { + for challenge in http_auth::ChallengeParser::new(header) { + match challenge { + Ok(challenge) if challenge.scheme.eq_ignore_ascii_case("Cargo") => { + // Look for the `login_url` parameter. + for (param, value) in challenge.params { + if param.eq_ignore_ascii_case("login_url") { + self.login_url = Some(value.to_unescaped().into_url()?); + } + } + } + Ok(challenge) => { + debug!("ignoring non-Cargo challenge: {}", challenge.scheme) + } + Err(e) => debug!("failed to parse challenge: {}", e), + } + } + } + } + StatusCode::Unauthorized => { + let err = Err(HttpNotSuccessful { + code: 401, + body: result.data, + url: self.full_url(path), + } + .into()); + if self.auth_required { + return Poll::Ready(err.context(auth::AuthorizationError { + sid: self.source_id.clone(), + login_url: self.login_url.clone(), + reason: auth::AuthorizationErrorReason::TokenRejected, + })); + } else { + return Poll::Ready(err); + } + } + } + } + + if path != Path::new("config.json") { + self.auth_required = ready!(self.config()?).auth_required; + } else if !self.auth_required { + // Check if there's a cached config that says auth is required. + // This allows avoiding the initial unauthenticated request to probe. + if let Some(config) = self.config_cached()? { + self.auth_required = config.auth_required; + } + } + + if !self.config.cli_unstable().registry_auth { + self.auth_required = false; + } + + // Looks like we're going to have to do a network request. + self.start_fetch()?; + + let mut handle = ops::http_handle(self.config)?; + let full_url = self.full_url(path); + debug!("fetch {}", full_url); + handle.get(true)?; + handle.url(&full_url)?; + handle.follow_location(true)?; + + // Enable HTTP/2 if possible. + if self.multiplexing { + crate::try_old_curl!(handle.http_version(HttpVersion::V2), "HTTP2"); + } else { + handle.http_version(HttpVersion::V11)?; + } + + // This is an option to `libcurl` which indicates that if there's a + // bunch of parallel requests to the same host they all wait until the + // pipelining status of the host is known. This means that we won't + // initiate dozens of connections to crates.io, but rather only one. + // Once the main one is opened we realized that pipelining is possible + // and multiplexing is possible with static.crates.io. All in all this + // reduces the number of connections done to a more manageable state. + crate::try_old_curl!(handle.pipewait(true), "pipewait"); + + let mut headers = List::new(); + // Include a header to identify the protocol. This allows the server to + // know that Cargo is attempting to use the sparse protocol. + headers.append("cargo-protocol: version=1")?; + headers.append("accept: text/plain")?; + + // If we have a cached copy of the file, include IF_NONE_MATCH or IF_MODIFIED_SINCE header. + if let Some(index_version) = index_version { + if let Some((key, value)) = index_version.split_once(':') { + match key { + ETAG => headers.append(&format!("{}: {}", IF_NONE_MATCH, value.trim()))?, + LAST_MODIFIED => { + headers.append(&format!("{}: {}", IF_MODIFIED_SINCE, value.trim()))? 
+ } + _ => debug!("unexpected index version: {}", index_version), + } + } + } + if self.auth_required { + let authorization = + auth::auth_token(self.config, &self.source_id, self.login_url.as_ref(), None)?; + headers.append(&format!("Authorization: {}", authorization))?; + trace!("including authorization for {}", full_url); + } + handle.http_headers(headers)?; + + // We're going to have a bunch of downloads all happening "at the same time". + // So, we need some way to track what headers/data/responses are for which request. + // We do that through this token. Each request (and associated response) gets one. + let token = self.downloads.next; + self.downloads.next += 1; + debug!("downloading {} as {}", path.display(), token); + assert!( + self.downloads.pending_paths.insert(path.to_path_buf()), + "path queued for download more than once" + ); + + // Each write should go to self.downloads.pending[&token].data. + // Since the write function must be 'static, we access downloads through a thread-local. + // That thread-local is set up in `block_until_ready` when it calls self.multi.perform, + // which is what ultimately calls this method. + handle.write_function(move |buf| { + trace!("{} - {} bytes of data", token, buf.len()); + tls::with(|downloads| { + if let Some(downloads) = downloads { + downloads.pending[&token] + .0 + .data + .borrow_mut() + .extend_from_slice(buf); + } + }); + Ok(buf.len()) + })?; + + // And ditto for the header function. + handle.header_function(move |buf| { + if let Some((tag, value)) = Self::handle_http_header(buf) { + tls::with(|downloads| { + if let Some(downloads) = downloads { + let mut header_map = downloads.pending[&token].0.header_map.borrow_mut(); + match tag.to_ascii_lowercase().as_str() { + LAST_MODIFIED => header_map.last_modified = Some(value.to_string()), + ETAG => header_map.etag = Some(value.to_string()), + WWW_AUTHENTICATE => header_map.www_authenticate.push(value.to_string()), + _ => {} + } + } + }); + } + + true + })?; + + let dl = Download { + token, + path: path.to_path_buf(), + data: RefCell::new(Vec::new()), + header_map: Default::default(), + retry: Retry::new(self.config)?, + }; + + // Finally add the request we've lined up to the pool of requests that cURL manages. + let mut handle = self.multi.add(handle)?; + handle.set_token(token)?; + self.downloads.pending.insert(dl.token, (dl, handle)); + + Poll::Pending + } + + fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> { + let mut cfg = ready!(self.config()?).clone(); + if !self.config.cli_unstable().registry_auth { + cfg.auth_required = false; + } + Poll::Ready(Ok(Some(cfg))) + } + + fn invalidate_cache(&mut self) { + // Actually updating the index is more or less a no-op for this implementation. + // All it does is ensure that a subsequent load will double-check files with the + // server rather than rely on a locally cached copy of the index files. + debug!("invalidated index cache"); + self.fresh.clear(); + self.requested_update = true; + } + + fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> { + let registry_config = loop { + match self.config()? 
{ + Poll::Pending => self.block_until_ready()?, + Poll::Ready(cfg) => break cfg.to_owned(), + } + }; + download::download( + &self.cache_path, + &self.config, + pkg, + checksum, + registry_config, + ) + } + + fn finish_download( + &mut self, + pkg: PackageId, + checksum: &str, + data: &[u8], + ) -> CargoResult<File> { + download::finish_download(&self.cache_path, &self.config, pkg, checksum, data) + } + + fn is_crate_downloaded(&self, pkg: PackageId) -> bool { + download::is_crate_downloaded(&self.cache_path, &self.config, pkg) + } + + fn block_until_ready(&mut self) -> CargoResult<()> { + trace!( + "block_until_ready: {} transfers pending", + self.downloads.pending.len() + ); + self.downloads.blocking_calls += 1; + + loop { + self.handle_completed_downloads()?; + + let remaining_in_multi = tls::set(&self.downloads, || { + self.multi + .perform() + .with_context(|| "failed to perform http requests") + })?; + trace!("{} transfers remaining", remaining_in_multi); + + if remaining_in_multi == 0 { + return Ok(()); + } + + // We have no more replies to provide the caller with, + // so we need to wait until cURL has something new for us. + let timeout = self + .multi + .get_timeout()? + .unwrap_or_else(|| Duration::new(1, 0)); + self.multi + .wait(&mut [], timeout) + .with_context(|| "failed to wait on curl `Multi`")?; + } + } +} + +impl<'cfg> Downloads<'cfg> { + fn tick(&self) -> CargoResult<()> { + let mut progress = self.progress.borrow_mut(); + let progress = progress.as_mut().unwrap(); + + // Since the sparse protocol discovers dependencies as it goes, + // it's not possible to get an accurate progress indication. + // + // As an approximation, we assume that the depth of the dependency graph + // is fixed, and base the progress on how many times the caller has asked + // for blocking. If there are actually additional dependencies, the progress + // bar will get stuck. If there are fewer dependencies, it will disappear + // early. It will never go backwards. + // + // The status text also contains the number of completed & pending requests, which + // gives an better indication of forward progress. + let approximate_tree_depth = 10; + + progress.tick( + self.blocking_calls.min(approximate_tree_depth), + approximate_tree_depth + 1, + &format!( + " {} complete; {} pending", + self.downloads_finished, + self.pending.len() + ), + ) + } +} + +mod tls { + use super::Downloads; + use std::cell::Cell; + + thread_local!(static PTR: Cell<usize> = Cell::new(0)); + + pub(crate) fn with<R>(f: impl FnOnce(Option<&Downloads<'_>>) -> R) -> R { + let ptr = PTR.with(|p| p.get()); + if ptr == 0 { + f(None) + } else { + // Safety: * `ptr` is only set by `set` below which ensures the type is correct. + let ptr = unsafe { &*(ptr as *const Downloads<'_>) }; + f(Some(ptr)) + } + } + + pub(crate) fn set<R>(dl: &Downloads<'_>, f: impl FnOnce() -> R) -> R { + struct Reset<'a, T: Copy>(&'a Cell<T>, T); + + impl<'a, T: Copy> Drop for Reset<'a, T> { + fn drop(&mut self) { + self.0.set(self.1); + } + } + + PTR.with(|p| { + let _reset = Reset(p, p.get()); + p.set(dl as *const Downloads<'_> as usize); + f() + }) + } +} diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs new file mode 100644 index 0000000..633ed74 --- /dev/null +++ b/src/cargo/sources/registry/index.rs @@ -0,0 +1,889 @@ +//! Management of the index of a registry source +//! +//! This module contains management of the index and various operations, such as +//! actually parsing the index, looking for crates, etc. 
This is intended to be +//! abstract over remote indices (downloaded via git) and local registry indices +//! (which are all just present on the filesystem). +//! +//! ## Index Performance +//! +//! One important aspect of the index is that we want to optimize the "happy +//! path" as much as possible. Whenever you type `cargo build` Cargo will +//! *always* reparse the registry and learn about dependency information. This +//! is done because Cargo needs to learn about the upstream crates.io crates +//! that you're using and ensure that the preexisting `Cargo.lock` still matches +//! the current state of the world. +//! +//! Consequently, Cargo "null builds" (the index that Cargo adds to each build +//! itself) need to be fast when accessing the index. The primary performance +//! optimization here is to avoid parsing JSON blobs from the registry if we +//! don't need them. Most secondary optimizations are centered around removing +//! allocations and such, but avoiding parsing JSON is the #1 optimization. +//! +//! When we get queries from the resolver we're given a `Dependency`. This +//! dependency in turn has a version requirement, and with lock files that +//! already exist these version requirements are exact version requirements +//! `=a.b.c`. This means that we in theory only need to parse one line of JSON +//! per query in the registry, the one that matches version `a.b.c`. +//! +//! The crates.io index, however, is not amenable to this form of query. Instead +//! the crates.io index simply is a file where each line is a JSON blob. To +//! learn about the versions in each JSON blob we would need to parse the JSON, +//! defeating the purpose of trying to parse as little as possible. +//! +//! > Note that as a small aside even *loading* the JSON from the registry is +//! > actually pretty slow. For crates.io and remote registries we don't +//! > actually check out the git index on disk because that takes quite some +//! > time and is quite large. Instead we use `libgit2` to read the JSON from +//! > the raw git objects. This in turn can be slow (aka show up high in +//! > profiles) because libgit2 has to do deflate decompression and such. +//! +//! To solve all these issues a strategy is employed here where Cargo basically +//! creates an index into the index. The first time a package is queried about +//! (first time being for an entire computer) Cargo will load the contents +//! (slowly via libgit2) from the registry. It will then (slowly) parse every +//! single line to learn about its versions. Afterwards, however, Cargo will +//! emit a new file (a cache) which is amenable for speedily parsing in future +//! invocations. +//! +//! This cache file is currently organized by basically having the semver +//! version extracted from each JSON blob. That way Cargo can quickly and easily +//! parse all versions contained and which JSON blob they're associated with. +//! The JSON blob then doesn't actually need to get parsed unless the version is +//! parsed. +//! +//! Altogether the initial measurements of this shows a massive improvement for +//! Cargo null build performance. It's expected that the improvements earned +//! here will continue to grow over time in the sense that the previous +//! implementation (parse all lines each time) actually continues to slow down +//! over time as new versions of a crate are published. In any case when first +//! implemented a null build of Cargo itself would parse 3700 JSON blobs from +//! the registry and load 150 blobs from git. 
Afterwards it parses 150 JSON +//! blobs and loads 0 files git. Removing 200ms or more from Cargo's startup +//! time is certainly nothing to sneeze at! +//! +//! Note that this is just a high-level overview, there's of course lots of +//! details like invalidating caches and whatnot which are handled below, but +//! hopefully those are more obvious inline in the code itself. + +use crate::core::dependency::Dependency; +use crate::core::{PackageId, SourceId, Summary}; +use crate::sources::registry::{LoadResponse, RegistryData, RegistryPackage, INDEX_V_MAX}; +use crate::util::interning::InternedString; +use crate::util::{internal, CargoResult, Config, Filesystem, OptVersionReq, ToSemver}; +use anyhow::bail; +use cargo_util::{paths, registry::make_dep_path}; +use log::{debug, info}; +use semver::Version; +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::io::ErrorKind; +use std::path::Path; +use std::str; +use std::task::{ready, Poll}; + +/// Crates.io treats hyphen and underscores as interchangeable, but the index and old Cargo do not. +/// Therefore, the index must store uncanonicalized version of the name so old Cargo's can find it. +/// This loop tries all possible combinations of switching hyphen and underscores to find the +/// uncanonicalized one. As all stored inputs have the correct spelling, we start with the spelling +/// as-provided. +struct UncanonicalizedIter<'s> { + input: &'s str, + num_hyphen_underscore: u32, + hyphen_combination_num: u16, +} + +impl<'s> UncanonicalizedIter<'s> { + fn new(input: &'s str) -> Self { + let num_hyphen_underscore = input.chars().filter(|&c| c == '_' || c == '-').count() as u32; + UncanonicalizedIter { + input, + num_hyphen_underscore, + hyphen_combination_num: 0, + } + } +} + +impl<'s> Iterator for UncanonicalizedIter<'s> { + type Item = String; + + fn next(&mut self) -> Option<Self::Item> { + if self.hyphen_combination_num > 0 + && self.hyphen_combination_num.trailing_zeros() >= self.num_hyphen_underscore + { + return None; + } + + let ret = Some( + self.input + .chars() + .scan(0u16, |s, c| { + // the check against 15 here's to prevent + // shift overflow on inputs with more than 15 hyphens + if (c == '_' || c == '-') && *s <= 15 { + let switch = (self.hyphen_combination_num & (1u16 << *s)) > 0; + let out = if (c == '_') ^ switch { '_' } else { '-' }; + *s += 1; + Some(out) + } else { + Some(c) + } + }) + .collect(), + ); + self.hyphen_combination_num += 1; + ret + } +} + +#[test] +fn no_hyphen() { + assert_eq!( + UncanonicalizedIter::new("test").collect::<Vec<_>>(), + vec!["test".to_string()] + ) +} + +#[test] +fn two_hyphen() { + assert_eq!( + UncanonicalizedIter::new("te-_st").collect::<Vec<_>>(), + vec![ + "te-_st".to_string(), + "te__st".to_string(), + "te--st".to_string(), + "te_-st".to_string() + ] + ) +} + +#[test] +fn overflow_hyphen() { + assert_eq!( + UncanonicalizedIter::new("te-_-_-_-_-_-_-_-_-st") + .take(100) + .count(), + 100 + ) +} + +/// Manager for handling the on-disk index. +/// +/// Note that local and remote registries store the index differently. Local +/// is a simple on-disk tree of files of the raw index. Remote registries are +/// stored as a raw git repository. The different means of access are handled +/// via the [`RegistryData`] trait abstraction. +/// +/// This transparently handles caching of the index in a more efficient format. +pub struct RegistryIndex<'cfg> { + source_id: SourceId, + /// Root directory of the index for the registry. + path: Filesystem, + /// Cache of summary data. 
+ /// + /// This is keyed off the package name. The [`Summaries`] value handles + /// loading the summary data. It keeps an optimized on-disk representation + /// of the JSON files, which is created in an as-needed fashion. If it + /// hasn't been cached already, it uses [`RegistryData::load`] to access + /// to JSON files from the index, and the creates the optimized on-disk + /// summary cache. + summaries_cache: HashMap<InternedString, Summaries>, + /// [`Config`] reference for convenience. + config: &'cfg Config, +} + +/// An internal cache of summaries for a particular package. +/// +/// A list of summaries are loaded from disk via one of two methods: +/// +/// 1. Primarily Cargo will parse the corresponding file for a crate in the +/// upstream crates.io registry. That's just a JSON blob per line which we +/// can parse, extract the version, and then store here. +/// +/// 2. Alternatively, if Cargo has previously run, we'll have a cached index of +/// dependencies for the upstream index. This is a file that Cargo maintains +/// lazily on the local filesystem and is much faster to parse since it +/// doesn't involve parsing all of the JSON. +/// +/// The outward-facing interface of this doesn't matter too much where it's +/// loaded from, but it's important when reading the implementation to note that +/// we try to parse as little as possible! +#[derive(Default)] +struct Summaries { + /// A raw vector of uninterpreted bytes. This is what `Unparsed` start/end + /// fields are indexes into. If a `Summaries` is loaded from the crates.io + /// index then this field will be empty since nothing is `Unparsed`. + raw_data: Vec<u8>, + + /// All known versions of a crate, keyed from their `Version` to the + /// possibly parsed or unparsed version of the full summary. + versions: HashMap<Version, MaybeIndexSummary>, +} + +/// A lazily parsed `IndexSummary`. +enum MaybeIndexSummary { + /// A summary which has not been parsed, The `start` and `end` are pointers + /// into `Summaries::raw_data` which this is an entry of. + Unparsed { start: usize, end: usize }, + + /// An actually parsed summary. + Parsed(IndexSummary), +} + +/// A parsed representation of a summary from the index. +/// +/// In addition to a full `Summary` we have information on whether it is `yanked`. +pub struct IndexSummary { + pub summary: Summary, + pub yanked: bool, + /// Schema version, see [`RegistryPackage`]. + v: u32, +} + +/// A representation of the cache on disk that Cargo maintains of summaries. +/// Cargo will initially parse all summaries in the registry and will then +/// serialize that into this form and place it in a new location on disk, +/// ensuring that access in the future is much speedier. +#[derive(Default)] +struct SummariesCache<'a> { + versions: Vec<(Version, &'a [u8])>, + index_version: &'a str, +} + +impl<'cfg> RegistryIndex<'cfg> { + pub fn new( + source_id: SourceId, + path: &Filesystem, + config: &'cfg Config, + ) -> RegistryIndex<'cfg> { + RegistryIndex { + source_id, + path: path.clone(), + summaries_cache: HashMap::new(), + config, + } + } + + /// Returns the hash listed for a specified `PackageId`. + pub fn hash(&mut self, pkg: PackageId, load: &mut dyn RegistryData) -> Poll<CargoResult<&str>> { + let req = OptVersionReq::exact(pkg.version()); + let summary = self.summaries(pkg.name(), &req, load)?; + let summary = ready!(summary).next(); + Poll::Ready(Ok(summary + .ok_or_else(|| internal(format!("no hash listed for {}", pkg)))? 
+ .summary + .checksum() + .ok_or_else(|| internal(format!("no hash listed for {}", pkg)))?)) + } + + /// Load a list of summaries for `name` package in this registry which + /// match `req` + /// + /// This function will semantically parse the on-disk index, match all + /// versions, and then return an iterator over all summaries which matched. + /// Internally there's quite a few layer of caching to amortize this cost + /// though since this method is called quite a lot on null builds in Cargo. + pub fn summaries<'a, 'b>( + &'a mut self, + name: InternedString, + req: &'b OptVersionReq, + load: &mut dyn RegistryData, + ) -> Poll<CargoResult<impl Iterator<Item = &'a IndexSummary> + 'b>> + where + 'a: 'b, + { + let source_id = self.source_id; + let config = self.config; + + // First up actually parse what summaries we have available. If Cargo + // has run previously this will parse a Cargo-specific cache file rather + // than the registry itself. In effect this is intended to be a quite + // cheap operation. + let summaries = ready!(self.load_summaries(name, load)?); + + // Iterate over our summaries, extract all relevant ones which match our + // version requirement, and then parse all corresponding rows in the + // registry. As a reminder this `summaries` method is called for each + // entry in a lock file on every build, so we want to absolutely + // minimize the amount of work being done here and parse as little as + // necessary. + let raw_data = &summaries.raw_data; + Poll::Ready(Ok(summaries + .versions + .iter_mut() + .filter_map(move |(k, v)| if req.matches(k) { Some(v) } else { None }) + .filter_map( + move |maybe| match maybe.parse(config, raw_data, source_id) { + Ok(summary) => Some(summary), + Err(e) => { + info!("failed to parse `{}` registry package: {}", name, e); + None + } + }, + ) + .filter(move |is| { + if is.v > INDEX_V_MAX { + debug!( + "unsupported schema version {} ({} {})", + is.v, + is.summary.name(), + is.summary.version() + ); + false + } else { + true + } + }))) + } + + fn load_summaries( + &mut self, + name: InternedString, + load: &mut dyn RegistryData, + ) -> Poll<CargoResult<&mut Summaries>> { + // If we've previously loaded what versions are present for `name`, just + // return that since our cache should still be valid. + if self.summaries_cache.contains_key(&name) { + return Poll::Ready(Ok(self.summaries_cache.get_mut(&name).unwrap())); + } + + // Prepare the `RegistryData` which will lazily initialize internal data + // structures. + load.prepare()?; + + let root = load.assert_index_locked(&self.path); + let cache_root = root.join(".cache"); + + // See module comment in `registry/mod.rs` for why this is structured + // the way it is. + let fs_name = name + .chars() + .flat_map(|c| c.to_lowercase()) + .collect::<String>(); + let raw_path = make_dep_path(&fs_name, false); + + let mut any_pending = false; + // Attempt to handle misspellings by searching for a chain of related + // names to the original `raw_path` name. Only return summaries + // associated with the first hit, however. The resolver will later + // reject any candidates that have the wrong name, and with this it'll + // along the way produce helpful "did you mean?" suggestions. 
+ for (i, path) in UncanonicalizedIter::new(&raw_path).take(1024).enumerate() { + let summaries = Summaries::parse( + root, + &cache_root, + path.as_ref(), + self.source_id, + load, + self.config, + )?; + if summaries.is_pending() { + if i == 0 { + // If we have not herd back about the name as requested + // then don't ask about other spellings yet. + // This prevents us spamming all the variations in the + // case where we have the correct spelling. + return Poll::Pending; + } + any_pending = true; + } + if let Poll::Ready(Some(summaries)) = summaries { + self.summaries_cache.insert(name, summaries); + return Poll::Ready(Ok(self.summaries_cache.get_mut(&name).unwrap())); + } + } + + if any_pending { + return Poll::Pending; + } + + // If nothing was found then this crate doesn't exists, so just use an + // empty `Summaries` list. + self.summaries_cache.insert(name, Summaries::default()); + Poll::Ready(Ok(self.summaries_cache.get_mut(&name).unwrap())) + } + + /// Clears the in-memory summaries cache. + pub fn clear_summaries_cache(&mut self) { + self.summaries_cache.clear(); + } + + pub fn query_inner( + &mut self, + dep: &Dependency, + load: &mut dyn RegistryData, + yanked_whitelist: &HashSet<PackageId>, + f: &mut dyn FnMut(Summary), + ) -> Poll<CargoResult<()>> { + if self.config.offline() { + // This should only return `Poll::Ready(Ok(()))` if there is at least 1 match. + // + // If there are 0 matches it should fall through and try again with online. + // This is necessary for dependencies that are not used (such as + // target-cfg or optional), but are not downloaded. Normally the + // build should succeed if they are not downloaded and not used, + // but they still need to resolve. If they are actually needed + // then cargo will fail to download and an error message + // indicating that the required dependency is unavailable while + // offline will be displayed. + if ready!(self.query_inner_with_online(dep, load, yanked_whitelist, f, false)?) > 0 { + return Poll::Ready(Ok(())); + } + } + self.query_inner_with_online(dep, load, yanked_whitelist, f, true) + .map_ok(|_| ()) + } + + fn query_inner_with_online( + &mut self, + dep: &Dependency, + load: &mut dyn RegistryData, + yanked_whitelist: &HashSet<PackageId>, + f: &mut dyn FnMut(Summary), + online: bool, + ) -> Poll<CargoResult<usize>> { + let source_id = self.source_id; + + let summaries = ready!(self.summaries(dep.package_name(), dep.version_req(), load))?; + + let summaries = summaries + // First filter summaries for `--offline`. If we're online then + // everything is a candidate, otherwise if we're offline we're only + // going to consider candidates which are actually present on disk. + // + // Note: This particular logic can cause problems with + // optional dependencies when offline. If at least 1 version + // of an optional dependency is downloaded, but that version + // does not satisfy the requirements, then resolution will + // fail. Unfortunately, whether or not something is optional + // is not known here. + .filter(|s| (online || load.is_crate_downloaded(s.summary.package_id()))) + // Next filter out all yanked packages. Some yanked packages may + // leak through if they're in a whitelist (aka if they were + // previously in `Cargo.lock` + .filter(|s| !s.yanked || yanked_whitelist.contains(&s.summary.package_id())) + .map(|s| s.summary.clone()); + + // Handle `cargo update --precise` here. 
If specified, our own source + // will have a precise version listed of the form + // `<pkg>=<p_req>o-><f_req>` where `<pkg>` is the name of a crate on + // this source, `<p_req>` is the version installed and `<f_req> is the + // version requested (argument to `--precise`). + let name = dep.package_name().as_str(); + let precise = match source_id.precise() { + Some(p) if p.starts_with(name) && p[name.len()..].starts_with('=') => { + let mut vers = p[name.len() + 1..].splitn(2, "->"); + let current_vers = vers.next().unwrap().to_semver().unwrap(); + let requested_vers = vers.next().unwrap().to_semver().unwrap(); + Some((current_vers, requested_vers)) + } + _ => None, + }; + let summaries = summaries.filter(|s| match &precise { + Some((current, requested)) => { + if dep.version_req().matches(current) { + // Unfortunately crates.io allows versions to differ only + // by build metadata. This shouldn't be allowed, but since + // it is, this will honor it if requested. However, if not + // specified, then ignore it. + let s_vers = s.version(); + match (s_vers.build.is_empty(), requested.build.is_empty()) { + (true, true) => s_vers == requested, + (true, false) => false, + (false, true) => { + // Strip out the metadata. + s_vers.major == requested.major + && s_vers.minor == requested.minor + && s_vers.patch == requested.patch + && s_vers.pre == requested.pre + } + (false, false) => s_vers == requested, + } + } else { + true + } + } + None => true, + }); + + let mut count = 0; + for summary in summaries { + f(summary); + count += 1; + } + Poll::Ready(Ok(count)) + } + + pub fn is_yanked( + &mut self, + pkg: PackageId, + load: &mut dyn RegistryData, + ) -> Poll<CargoResult<bool>> { + let req = OptVersionReq::exact(pkg.version()); + let found = self + .summaries(pkg.name(), &req, load) + .map_ok(|mut p| p.any(|summary| summary.yanked)); + found + } +} + +impl Summaries { + /// Parse out a `Summaries` instances from on-disk state. + /// + /// This will attempt to prefer parsing a previous cache file that already + /// exists from a previous invocation of Cargo (aka you're typing `cargo + /// build` again after typing it previously). If parsing fails or the cache + /// isn't found, then we take a slower path which loads the full descriptor + /// for `relative` from the underlying index (aka typically libgit2 with + /// crates.io) and then parse everything in there. + /// + /// * `root` - this is the root argument passed to `load` + /// * `cache_root` - this is the root on the filesystem itself of where to + /// store cache files. + /// * `relative` - this is the file we're loading from cache or the index + /// data + /// * `source_id` - the registry's SourceId used when parsing JSON blobs to + /// create summaries. + /// * `load` - the actual index implementation which may be very slow to + /// call. We avoid this if we can. + pub fn parse( + root: &Path, + cache_root: &Path, + relative: &Path, + source_id: SourceId, + load: &mut dyn RegistryData, + config: &Config, + ) -> Poll<CargoResult<Option<Summaries>>> { + // First up, attempt to load the cache. This could fail for all manner + // of reasons, but consider all of them non-fatal and just log their + // occurrence in case anyone is debugging anything. 
+ let cache_path = cache_root.join(relative); + let mut cached_summaries = None; + let mut index_version = None; + match fs::read(&cache_path) { + Ok(contents) => match Summaries::parse_cache(contents) { + Ok((s, v)) => { + cached_summaries = Some(s); + index_version = Some(v); + } + Err(e) => { + log::debug!("failed to parse {:?} cache: {}", relative, e); + } + }, + Err(e) => log::debug!("cache missing for {:?} error: {}", relative, e), + } + + let response = ready!(load.load(root, relative, index_version.as_deref())?); + + match response { + LoadResponse::CacheValid => { + log::debug!("fast path for registry cache of {:?}", relative); + return Poll::Ready(Ok(cached_summaries)); + } + LoadResponse::NotFound => { + if let Err(e) = fs::remove_file(cache_path) { + if e.kind() != ErrorKind::NotFound { + log::debug!("failed to remove from cache: {}", e); + } + } + return Poll::Ready(Ok(None)); + } + LoadResponse::Data { + raw_data, + index_version, + } => { + // This is the fallback path where we actually talk to the registry backend to load + // information. Here we parse every single line in the index (as we need + // to find the versions) + log::debug!("slow path for {:?}", relative); + let mut cache = SummariesCache::default(); + let mut ret = Summaries::default(); + ret.raw_data = raw_data; + for line in split(&ret.raw_data, b'\n') { + // Attempt forwards-compatibility on the index by ignoring + // everything that we ourselves don't understand, that should + // allow future cargo implementations to break the + // interpretation of each line here and older cargo will simply + // ignore the new lines. + let summary = match IndexSummary::parse(config, line, source_id) { + Ok(summary) => summary, + Err(e) => { + // This should only happen when there is an index + // entry from a future version of cargo that this + // version doesn't understand. Hopefully, those future + // versions of cargo correctly set INDEX_V_MAX and + // CURRENT_CACHE_VERSION, otherwise this will skip + // entries in the cache preventing those newer + // versions from reading them (that is, until the + // cache is rebuilt). + log::info!("failed to parse {:?} registry package: {}", relative, e); + continue; + } + }; + let version = summary.summary.package_id().version().clone(); + cache.versions.push((version.clone(), line)); + ret.versions.insert(version, summary.into()); + } + if let Some(index_version) = index_version { + log::trace!("caching index_version {}", index_version); + let cache_bytes = cache.serialize(index_version.as_str()); + // Once we have our `cache_bytes` which represents the `Summaries` we're + // about to return, write that back out to disk so future Cargo + // invocations can use it. + // + // This is opportunistic so we ignore failure here but are sure to log + // something in case of error. + if paths::create_dir_all(cache_path.parent().unwrap()).is_ok() { + let path = Filesystem::new(cache_path.clone()); + config.assert_package_cache_locked(&path); + if let Err(e) = fs::write(cache_path, &cache_bytes) { + log::info!("failed to write cache: {}", e); + } + } + + // If we've got debug assertions enabled read back in the cached values + // and assert they match the expected result. 
+ #[cfg(debug_assertions)] + { + let readback = SummariesCache::parse(&cache_bytes) + .expect("failed to parse cache we just wrote"); + assert_eq!( + readback.index_version, index_version, + "index_version mismatch" + ); + assert_eq!(readback.versions, cache.versions, "versions mismatch"); + } + } + Poll::Ready(Ok(Some(ret))) + } + } + } + + /// Parses an open `File` which represents information previously cached by + /// Cargo. + pub fn parse_cache(contents: Vec<u8>) -> CargoResult<(Summaries, InternedString)> { + let cache = SummariesCache::parse(&contents)?; + let index_version = InternedString::new(cache.index_version); + let mut ret = Summaries::default(); + for (version, summary) in cache.versions { + let (start, end) = subslice_bounds(&contents, summary); + ret.versions + .insert(version, MaybeIndexSummary::Unparsed { start, end }); + } + ret.raw_data = contents; + return Ok((ret, index_version)); + + // Returns the start/end offsets of `inner` with `outer`. Asserts that + // `inner` is a subslice of `outer`. + fn subslice_bounds(outer: &[u8], inner: &[u8]) -> (usize, usize) { + let outer_start = outer.as_ptr() as usize; + let outer_end = outer_start + outer.len(); + let inner_start = inner.as_ptr() as usize; + let inner_end = inner_start + inner.len(); + assert!(inner_start >= outer_start); + assert!(inner_end <= outer_end); + (inner_start - outer_start, inner_end - outer_start) + } + } +} + +// Implementation of serializing/deserializing the cache of summaries on disk. +// Currently the format looks like: +// +// +--------------------+----------------------+-------------+---+ +// | cache version byte | index format version | git sha rev | 0 | +// +--------------------+----------------------+-------------+---+ +// +// followed by... +// +// +----------------+---+------------+---+ +// | semver version | 0 | JSON blob | 0 | ... +// +----------------+---+------------+---+ +// +// The idea is that this is a very easy file for Cargo to parse in future +// invocations. The read from disk should be quite fast and then afterwards all +// we need to know is what versions correspond to which JSON blob. +// +// The leading version byte is intended to ensure that there's some level of +// future compatibility against changes to this cache format so if different +// versions of Cargo share the same cache they don't get too confused. The git +// sha lets us know when the file needs to be regenerated (it needs regeneration +// whenever the index itself updates). +// +// Cache versions: +// * `1`: The original version. +// * `2`: Added the "index format version" field so that if the index format +// changes, different versions of cargo won't get confused reading each +// other's caches. +// * `3`: Bumped the version to work around an issue where multiple versions of +// a package were published that differ only by semver metadata. For +// example, openssl-src 110.0.0 and 110.0.0+1.1.0f. Previously, the cache +// would be incorrectly populated with two entries, both 110.0.0. After +// this, the metadata will be correctly included. This isn't really a format +// change, just a version bump to clear the incorrect cache entries. Note: +// the index shouldn't allow these, but unfortunately crates.io doesn't +// check it. 
+ +const CURRENT_CACHE_VERSION: u8 = 3; + +impl<'a> SummariesCache<'a> { + fn parse(data: &'a [u8]) -> CargoResult<SummariesCache<'a>> { + // NB: keep this method in sync with `serialize` below + let (first_byte, rest) = data + .split_first() + .ok_or_else(|| anyhow::format_err!("malformed cache"))?; + if *first_byte != CURRENT_CACHE_VERSION { + bail!("looks like a different Cargo's cache, bailing out"); + } + let index_v_bytes = rest + .get(..4) + .ok_or_else(|| anyhow::anyhow!("cache expected 4 bytes for index version"))?; + let index_v = u32::from_le_bytes(index_v_bytes.try_into().unwrap()); + if index_v != INDEX_V_MAX { + bail!( + "index format version {} doesn't match the version I know ({})", + index_v, + INDEX_V_MAX + ); + } + let rest = &rest[4..]; + + let mut iter = split(rest, 0); + let last_index_update = if let Some(update) = iter.next() { + str::from_utf8(update)? + } else { + bail!("malformed file"); + }; + let mut ret = SummariesCache::default(); + ret.index_version = last_index_update; + while let Some(version) = iter.next() { + let version = str::from_utf8(version)?; + let version = Version::parse(version)?; + let summary = iter.next().unwrap(); + ret.versions.push((version, summary)); + } + Ok(ret) + } + + fn serialize(&self, index_version: &str) -> Vec<u8> { + // NB: keep this method in sync with `parse` above + let size = self + .versions + .iter() + .map(|(_version, data)| (10 + data.len())) + .sum(); + let mut contents = Vec::with_capacity(size); + contents.push(CURRENT_CACHE_VERSION); + contents.extend(&u32::to_le_bytes(INDEX_V_MAX)); + contents.extend_from_slice(index_version.as_bytes()); + contents.push(0); + for (version, data) in self.versions.iter() { + contents.extend_from_slice(version.to_string().as_bytes()); + contents.push(0); + contents.extend_from_slice(data); + contents.push(0); + } + contents + } +} + +impl MaybeIndexSummary { + /// Parses this "maybe a summary" into a `Parsed` for sure variant. + /// + /// Does nothing if this is already `Parsed`, and otherwise the `raw_data` + /// passed in is sliced with the bounds in `Unparsed` and then actually + /// parsed. + fn parse( + &mut self, + config: &Config, + raw_data: &[u8], + source_id: SourceId, + ) -> CargoResult<&IndexSummary> { + let (start, end) = match self { + MaybeIndexSummary::Unparsed { start, end } => (*start, *end), + MaybeIndexSummary::Parsed(summary) => return Ok(summary), + }; + let summary = IndexSummary::parse(config, &raw_data[start..end], source_id)?; + *self = MaybeIndexSummary::Parsed(summary); + match self { + MaybeIndexSummary::Unparsed { .. } => unreachable!(), + MaybeIndexSummary::Parsed(summary) => Ok(summary), + } + } +} + +impl From<IndexSummary> for MaybeIndexSummary { + fn from(summary: IndexSummary) -> MaybeIndexSummary { + MaybeIndexSummary::Parsed(summary) + } +} + +impl IndexSummary { + /// Parses a line from the registry's index file into an `IndexSummary` for + /// a package. + /// + /// The `line` provided is expected to be valid JSON. + fn parse(config: &Config, line: &[u8], source_id: SourceId) -> CargoResult<IndexSummary> { + // ****CAUTION**** Please be extremely careful with returning errors + // from this function. Entries that error are not included in the + // index cache, and can cause cargo to get confused when switching + // between different versions that understand the index differently. + // Make sure to consider the INDEX_V_MAX and CURRENT_CACHE_VERSION + // values carefully when making changes here. 
+ let RegistryPackage { + name, + vers, + cksum, + deps, + mut features, + features2, + yanked, + links, + v, + } = serde_json::from_slice(line)?; + let v = v.unwrap_or(1); + log::trace!("json parsed registry {}/{}", name, vers); + let pkgid = PackageId::new(name, &vers, source_id)?; + let deps = deps + .into_iter() + .map(|dep| dep.into_dep(source_id)) + .collect::<CargoResult<Vec<_>>>()?; + if let Some(features2) = features2 { + for (name, values) in features2 { + features.entry(name).or_default().extend(values); + } + } + let mut summary = Summary::new(config, pkgid, deps, &features, links)?; + summary.set_checksum(cksum); + Ok(IndexSummary { + summary, + yanked: yanked.unwrap_or(false), + v, + }) + } +} + +fn split(haystack: &[u8], needle: u8) -> impl Iterator<Item = &[u8]> { + struct Split<'a> { + haystack: &'a [u8], + needle: u8, + } + + impl<'a> Iterator for Split<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option<&'a [u8]> { + if self.haystack.is_empty() { + return None; + } + let (ret, remaining) = match memchr::memchr(self.needle, self.haystack) { + Some(pos) => (&self.haystack[..pos], &self.haystack[pos + 1..]), + None => (self.haystack, &[][..]), + }; + self.haystack = remaining; + Some(ret) + } + } + + Split { haystack, needle } +} diff --git a/src/cargo/sources/registry/local.rs b/src/cargo/sources/registry/local.rs new file mode 100644 index 0000000..a4b57a9 --- /dev/null +++ b/src/cargo/sources/registry/local.rs @@ -0,0 +1,149 @@ +use crate::core::PackageId; +use crate::sources::registry::{LoadResponse, MaybeLock, RegistryConfig, RegistryData}; +use crate::util::errors::CargoResult; +use crate::util::{Config, Filesystem}; +use cargo_util::{paths, Sha256}; +use std::fs::File; +use std::io::SeekFrom; +use std::io::{self, prelude::*}; +use std::path::Path; +use std::task::Poll; + +/// A local registry is a registry that lives on the filesystem as a set of +/// `.crate` files with an `index` directory in the same format as a remote +/// registry. +pub struct LocalRegistry<'cfg> { + index_path: Filesystem, + root: Filesystem, + src_path: Filesystem, + config: &'cfg Config, + updated: bool, +} + +impl<'cfg> LocalRegistry<'cfg> { + pub fn new(root: &Path, config: &'cfg Config, name: &str) -> LocalRegistry<'cfg> { + LocalRegistry { + src_path: config.registry_source_path().join(name), + index_path: Filesystem::new(root.join("index")), + root: Filesystem::new(root.to_path_buf()), + config, + updated: false, + } + } +} + +impl<'cfg> RegistryData for LocalRegistry<'cfg> { + fn prepare(&self) -> CargoResult<()> { + Ok(()) + } + + fn index_path(&self) -> &Filesystem { + &self.index_path + } + + fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path { + // Note that the `*_unlocked` variant is used here since we're not + // modifying the index and it's required to be externally synchronized. 
+ path.as_path_unlocked() + } + + fn load( + &mut self, + root: &Path, + path: &Path, + _index_version: Option<&str>, + ) -> Poll<CargoResult<LoadResponse>> { + if self.updated { + let raw_data = match paths::read_bytes(&root.join(path)) { + Err(e) + if e.downcast_ref::<io::Error>() + .map_or(false, |ioe| ioe.kind() == io::ErrorKind::NotFound) => + { + return Poll::Ready(Ok(LoadResponse::NotFound)); + } + r => r, + }?; + Poll::Ready(Ok(LoadResponse::Data { + raw_data, + index_version: None, + })) + } else { + Poll::Pending + } + } + + fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> { + // Local registries don't have configuration for remote APIs or anything + // like that + Poll::Ready(Ok(None)) + } + + fn block_until_ready(&mut self) -> CargoResult<()> { + if self.updated { + return Ok(()); + } + // Nothing to update, we just use what's on disk. Verify it actually + // exists though. We don't use any locks as we're just checking whether + // these directories exist. + let root = self.root.clone().into_path_unlocked(); + if !root.is_dir() { + anyhow::bail!("local registry path is not a directory: {}", root.display()); + } + let index_path = self.index_path.clone().into_path_unlocked(); + if !index_path.is_dir() { + anyhow::bail!( + "local registry index path is not a directory: {}", + index_path.display() + ); + } + self.updated = true; + Ok(()) + } + + fn invalidate_cache(&mut self) { + // Local registry has no cache - just reads from disk. + } + + fn is_updated(&self) -> bool { + self.updated + } + + fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> { + let crate_file = format!("{}-{}.crate", pkg.name(), pkg.version()); + + // Note that the usage of `into_path_unlocked` here is because the local + // crate files here never change in that we're not the one writing them, + // so it's not our responsibility to synchronize access to them. + let path = self.root.join(&crate_file).into_path_unlocked(); + let mut crate_file = paths::open(&path)?; + + // If we've already got an unpacked version of this crate, then skip the + // checksum below as it is in theory already verified. + let dst = format!("{}-{}", pkg.name(), pkg.version()); + if self.src_path.join(dst).into_path_unlocked().exists() { + return Ok(MaybeLock::Ready(crate_file)); + } + + self.config.shell().status("Unpacking", pkg)?; + + // We don't actually need to download anything per-se, we just need to + // verify the checksum matches the .crate file itself. + let actual = Sha256::new().update_file(&crate_file)?.finish_hex(); + if actual != checksum { + anyhow::bail!("failed to verify the checksum of `{}`", pkg) + } + + crate_file.seek(SeekFrom::Start(0))?; + + Ok(MaybeLock::Ready(crate_file)) + } + + fn finish_download( + &mut self, + _pkg: PackageId, + _checksum: &str, + _data: &[u8], + ) -> CargoResult<File> { + panic!("this source doesn't download") + } +} diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs new file mode 100644 index 0000000..930a42f --- /dev/null +++ b/src/cargo/sources/registry/mod.rs @@ -0,0 +1,970 @@ +//! A `Source` for registry-based packages. +//! +//! # What's a Registry? +//! +//! Registries are central locations where packages can be uploaded to, +//! discovered, and searched for. The purpose of a registry is to have a +//! location that serves as permanent storage for versions of a crate over time. +//! +//! Compared to git sources, a registry provides many packages as well as many +//! versions simultaneously. 
Git sources can also have commits deleted through +//! rebasings where registries cannot have their versions deleted. +//! +//! # The Index of a Registry +//! +//! One of the major difficulties with a registry is that hosting so many +//! packages may quickly run into performance problems when dealing with +//! dependency graphs. It's infeasible for cargo to download the entire contents +//! of the registry just to resolve one package's dependencies, for example. As +//! a result, cargo needs some efficient method of querying what packages are +//! available on a registry, what versions are available, and what the +//! dependencies for each version is. +//! +//! One method of doing so would be having the registry expose an HTTP endpoint +//! which can be queried with a list of packages and a response of their +//! dependencies and versions is returned. This is somewhat inefficient however +//! as we may have to hit the endpoint many times and we may have already +//! queried for much of the data locally already (for other packages, for +//! example). This also involves inventing a transport format between the +//! registry and Cargo itself, so this route was not taken. +//! +//! Instead, Cargo communicates with registries through a git repository +//! referred to as the Index. The Index of a registry is essentially an easily +//! query-able version of the registry's database for a list of versions of a +//! package as well as a list of dependencies for each version. +//! +//! Using git to host this index provides a number of benefits: +//! +//! * The entire index can be stored efficiently locally on disk. This means +//! that all queries of a registry can happen locally and don't need to touch +//! the network. +//! +//! * Updates of the index are quite efficient. Using git buys incremental +//! updates, compressed transmission, etc for free. The index must be updated +//! each time we need fresh information from a registry, but this is one +//! update of a git repository that probably hasn't changed a whole lot so +//! it shouldn't be too expensive. +//! +//! Additionally, each modification to the index is just appending a line at +//! the end of a file (the exact format is described later). This means that +//! the commits for an index are quite small and easily applied/compressible. +//! +//! ## The format of the Index +//! +//! The index is a store for the list of versions for all packages known, so its +//! format on disk is optimized slightly to ensure that `ls registry` doesn't +//! produce a list of all packages ever known. The index also wants to ensure +//! that there's not a million files which may actually end up hitting +//! filesystem limits at some point. To this end, a few decisions were made +//! about the format of the registry: +//! +//! 1. Each crate will have one file corresponding to it. Each version for a +//! crate will just be a line in this file. +//! 2. There will be two tiers of directories for crate names, under which +//! crates corresponding to those tiers will be located. +//! +//! As an example, this is an example hierarchy of an index: +//! +//! ```notrust +//! . +//! ├── 3 +//! │  └── u +//! │  └── url +//! ├── bz +//! │  └── ip +//! │  └── bzip2 +//! ├── config.json +//! ├── en +//! │  └── co +//! │  └── encoding +//! └── li +//!   ├── bg +//!   │  └── libgit2 +//!   └── nk +//!   └── link-config +//! ``` +//! +//! The root of the index contains a `config.json` file with a few entries +//! corresponding to the registry (see [`RegistryConfig`] below). 
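+//!
+//! For illustration, a `config.json` has roughly this shape (the URLs here
+//! are invented; real registries substitute their own endpoints, and the
+//! exact fields are described by [`RegistryConfig`]):
+//!
+//! ```notrust
+//! {
+//!     "dl": "https://example-registry.test/api/v1/crates",
+//!     "api": "https://example-registry.test"
+//! }
+//! ```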
+//! +//! Otherwise, there are three numbered directories (1, 2, 3) for crates with +//! names 1, 2, and 3 characters in length. The 1/2 directories simply have the +//! crate files underneath them, while the 3 directory is sharded by the first +//! letter of the crate name. +//! +//! Otherwise the top-level directory contains many two-letter directory names, +//! each of which has many sub-folders with two letters. At the end of all these +//! are the actual crate files themselves. +//! +//! The purpose of this layout is to hopefully cut down on `ls` sizes as well as +//! efficient lookup based on the crate name itself. +//! +//! ## Crate files +//! +//! Each file in the index is the history of one crate over time. Each line in +//! the file corresponds to one version of a crate, stored in JSON format (see +//! the `RegistryPackage` structure below). +//! +//! As new versions are published, new lines are appended to this file. The only +//! modifications to this file that should happen over time are yanks of a +//! particular version. +//! +//! # Downloading Packages +//! +//! The purpose of the Index was to provide an efficient method to resolve the +//! dependency graph for a package. So far we only required one network +//! interaction to update the registry's repository (yay!). After resolution has +//! been performed, however we need to download the contents of packages so we +//! can read the full manifest and build the source code. +//! +//! To accomplish this, this source's `download` method will make an HTTP +//! request per-package requested to download tarballs into a local cache. These +//! tarballs will then be unpacked into a destination folder. +//! +//! Note that because versions uploaded to the registry are frozen forever that +//! the HTTP download and unpacking can all be skipped if the version has +//! already been downloaded and unpacked. This caching allows us to only +//! download a package when absolutely necessary. +//! +//! # Filesystem Hierarchy +//! +//! Overall, the `$HOME/.cargo` looks like this when talking about the registry: +//! +//! ```notrust +//! # A folder under which all registry metadata is hosted (similar to +//! # $HOME/.cargo/git) +//! $HOME/.cargo/registry/ +//! +//! # For each registry that cargo knows about (keyed by hostname + hash) +//! # there is a folder which is the checked out version of the index for +//! # the registry in this location. Note that this is done so cargo can +//! # support multiple registries simultaneously +//! index/ +//! registry1-<hash>/ +//! registry2-<hash>/ +//! ... +//! +//! # This folder is a cache for all downloaded tarballs from a registry. +//! # Once downloaded and verified, a tarball never changes. +//! cache/ +//! registry1-<hash>/<pkg>-<version>.crate +//! ... +//! +//! # Location in which all tarballs are unpacked. Each tarball is known to +//! # be frozen after downloading, so transitively this folder is also +//! # frozen once its unpacked (it's never unpacked again) +//! src/ +//! registry1-<hash>/<pkg>-<version>/... +//! ... +//! 
``` + +use std::borrow::Cow; +use std::collections::BTreeMap; +use std::collections::HashSet; +use std::fs::{File, OpenOptions}; +use std::io::{self, Write}; +use std::path::{Path, PathBuf}; +use std::task::Poll; + +use anyhow::Context as _; +use cargo_util::paths::{self, exclude_from_backups_and_indexing}; +use flate2::read::GzDecoder; +use log::debug; +use semver::Version; +use serde::Deserialize; +use tar::Archive; + +use crate::core::dependency::{DepKind, Dependency}; +use crate::core::source::MaybePackage; +use crate::core::{Package, PackageId, QueryKind, Source, SourceId, Summary}; +use crate::sources::PathSource; +use crate::util::hex; +use crate::util::interning::InternedString; +use crate::util::into_url::IntoUrl; +use crate::util::network::PollExt; +use crate::util::{ + restricted_names, CargoResult, Config, Filesystem, LimitErrorReader, OptVersionReq, +}; + +const PACKAGE_SOURCE_LOCK: &str = ".cargo-ok"; +pub const CRATES_IO_INDEX: &str = "https://github.com/rust-lang/crates.io-index"; +pub const CRATES_IO_HTTP_INDEX: &str = "sparse+https://index.crates.io/"; +pub const CRATES_IO_REGISTRY: &str = "crates-io"; +pub const CRATES_IO_DOMAIN: &str = "crates.io"; +const CRATE_TEMPLATE: &str = "{crate}"; +const VERSION_TEMPLATE: &str = "{version}"; +const PREFIX_TEMPLATE: &str = "{prefix}"; +const LOWER_PREFIX_TEMPLATE: &str = "{lowerprefix}"; +const CHECKSUM_TEMPLATE: &str = "{sha256-checksum}"; +const MAX_UNPACK_SIZE: u64 = 512 * 1024 * 1024; +const MAX_COMPRESSION_RATIO: usize = 20; // 20:1 + +/// A "source" for a local (see `local::LocalRegistry`) or remote (see +/// `remote::RemoteRegistry`) registry. +/// +/// This contains common functionality that is shared between the two registry +/// kinds, with the registry-specific logic implemented as part of the +/// [`RegistryData`] trait referenced via the `ops` field. +pub struct RegistrySource<'cfg> { + source_id: SourceId, + /// The path where crate files are extracted (`$CARGO_HOME/registry/src/$REG-HASH`). + src_path: Filesystem, + /// Local reference to [`Config`] for convenience. + config: &'cfg Config, + /// Abstraction for interfacing to the different registry kinds. + ops: Box<dyn RegistryData + 'cfg>, + /// Interface for managing the on-disk index. + index: index::RegistryIndex<'cfg>, + /// A set of packages that should be allowed to be used, even if they are + /// yanked. + /// + /// This is populated from the entries in `Cargo.lock` to ensure that + /// `cargo update -p somepkg` won't unlock yanked entries in `Cargo.lock`. + /// Otherwise, the resolver would think that those entries no longer + /// exist, and it would trigger updates to unrelated packages. + yanked_whitelist: HashSet<PackageId>, +} + +/// The `config.json` file stored in the index. +#[derive(Deserialize, Debug, Clone)] +#[serde(rename_all = "kebab-case")] +pub struct RegistryConfig { + /// Download endpoint for all crates. + /// + /// The string is a template which will generate the download URL for the + /// tarball of a specific version of a crate. The substrings `{crate}` and + /// `{version}` will be replaced with the crate's name and version + /// respectively. The substring `{prefix}` will be replaced with the + /// crate's prefix directory name, and the substring `{lowerprefix}` will + /// be replaced with the crate's prefix directory name converted to + /// lowercase. The substring `{sha256-checksum}` will be replaced with the + /// crate's sha256 checksum. 
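+ ///
+ /// For illustration (the host name is invented): a template of
+ /// `https://dl.example.test/{prefix}/{crate}-{version}.crate` expands for
+ /// `Bzip2 0.1.0` to `https://dl.example.test/Bz/ip/Bzip2-0.1.0.crate`,
+ /// while `{lowerprefix}` in the same position would yield `bz/ip`.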
+ /// + /// For backwards compatibility, if the string does not contain any + /// markers (`{crate}`, `{version}`, `{prefix}`, or ``{lowerprefix}`), it + /// will be extended with `/{crate}/{version}/download` to + /// support registries like crates.io which were created before the + /// templating setup was created. + pub dl: String, + + /// API endpoint for the registry. This is what's actually hit to perform + /// operations like yanks, owner modifications, publish new crates, etc. + /// If this is None, the registry does not support API commands. + pub api: Option<String>, + + /// Whether all operations require authentication. + #[serde(default)] + pub auth_required: bool, +} + +/// The maximum version of the `v` field in the index this version of cargo +/// understands. +pub(crate) const INDEX_V_MAX: u32 = 2; + +/// A single line in the index representing a single version of a package. +#[derive(Deserialize)] +pub struct RegistryPackage<'a> { + name: InternedString, + vers: Version, + #[serde(borrow)] + deps: Vec<RegistryDependency<'a>>, + features: BTreeMap<InternedString, Vec<InternedString>>, + /// This field contains features with new, extended syntax. Specifically, + /// namespaced features (`dep:`) and weak dependencies (`pkg?/feat`). + /// + /// This is separated from `features` because versions older than 1.19 + /// will fail to load due to not being able to parse the new syntax, even + /// with a `Cargo.lock` file. + features2: Option<BTreeMap<InternedString, Vec<InternedString>>>, + cksum: String, + /// If `true`, Cargo will skip this version when resolving. + /// + /// This was added in 2014. Everything in the crates.io index has this set + /// now, so this probably doesn't need to be an option anymore. + yanked: Option<bool>, + /// Native library name this package links to. + /// + /// Added early 2018 (see <https://github.com/rust-lang/cargo/pull/4978>), + /// can be `None` if published before then. + links: Option<InternedString>, + /// The schema version for this entry. + /// + /// If this is None, it defaults to version 1. Entries with unknown + /// versions are ignored. + /// + /// Version `2` format adds the `features2` field. + /// + /// This provides a method to safely introduce changes to index entries + /// and allow older versions of cargo to ignore newer entries it doesn't + /// understand. This is honored as of 1.51, so unfortunately older + /// versions will ignore it, and potentially misinterpret version 2 and + /// newer entries. + /// + /// The intent is that versions older than 1.51 will work with a + /// pre-existing `Cargo.lock`, but they may not correctly process `cargo + /// update` or build a lock from scratch. In that case, cargo may + /// incorrectly select a new package that uses a new index format. A + /// workaround is to downgrade any packages that are incompatible with the + /// `--precise` flag of `cargo update`. 
+ v: Option<u32>, +} + +#[test] +fn escaped_char_in_json() { + let _: RegistryPackage<'_> = serde_json::from_str( + r#"{"name":"a","vers":"0.0.1","deps":[],"cksum":"bae3","features":{}}"#, + ) + .unwrap(); + let _: RegistryPackage<'_> = serde_json::from_str( + r#"{"name":"a","vers":"0.0.1","deps":[],"cksum":"bae3","features":{"test":["k","q"]},"links":"a-sys"}"# + ).unwrap(); + + // Now we add escaped cher all the places they can go + // these are not valid, but it should error later than json parsing + let _: RegistryPackage<'_> = serde_json::from_str( + r#"{ + "name":"This name has a escaped cher in it \n\t\" ", + "vers":"0.0.1", + "deps":[{ + "name": " \n\t\" ", + "req": " \n\t\" ", + "features": [" \n\t\" "], + "optional": true, + "default_features": true, + "target": " \n\t\" ", + "kind": " \n\t\" ", + "registry": " \n\t\" " + }], + "cksum":"bae3", + "features":{"test \n\t\" ":["k \n\t\" ","q \n\t\" "]}, + "links":" \n\t\" "}"#, + ) + .unwrap(); +} + +/// A dependency as encoded in the index JSON. +#[derive(Deserialize)] +struct RegistryDependency<'a> { + name: InternedString, + #[serde(borrow)] + req: Cow<'a, str>, + features: Vec<InternedString>, + optional: bool, + default_features: bool, + target: Option<Cow<'a, str>>, + kind: Option<Cow<'a, str>>, + registry: Option<Cow<'a, str>>, + package: Option<InternedString>, + public: Option<bool>, +} + +impl<'a> RegistryDependency<'a> { + /// Converts an encoded dependency in the registry to a cargo dependency + pub fn into_dep(self, default: SourceId) -> CargoResult<Dependency> { + let RegistryDependency { + name, + req, + mut features, + optional, + default_features, + target, + kind, + registry, + package, + public, + } = self; + + let id = if let Some(registry) = ®istry { + SourceId::for_registry(®istry.into_url()?)? + } else { + default + }; + + let mut dep = Dependency::parse(package.unwrap_or(name), Some(&req), id)?; + if package.is_some() { + dep.set_explicit_name_in_toml(name); + } + let kind = match kind.as_deref().unwrap_or("") { + "dev" => DepKind::Development, + "build" => DepKind::Build, + _ => DepKind::Normal, + }; + + let platform = match target { + Some(target) => Some(target.parse()?), + None => None, + }; + + // All dependencies are private by default + let public = public.unwrap_or(false); + + // Unfortunately older versions of cargo and/or the registry ended up + // publishing lots of entries where the features array contained the + // empty feature, "", inside. This confuses the resolution process much + // later on and these features aren't actually valid, so filter them all + // out here. + features.retain(|s| !s.is_empty()); + + // In index, "registry" is null if it is from the same index. + // In Cargo.toml, "registry" is None if it is from the default + if !id.is_crates_io() { + dep.set_registry_id(id); + } + + dep.set_optional(optional) + .set_default_features(default_features) + .set_features(features) + .set_platform(platform) + .set_kind(kind) + .set_public(public); + + Ok(dep) + } +} + +/// Result from loading data from a registry. +pub enum LoadResponse { + /// The cache is valid. The cached data should be used. + CacheValid, + + /// The cache is out of date. Returned data should be used. + Data { + raw_data: Vec<u8>, + index_version: Option<String>, + }, + + /// The requested crate was found. + NotFound, +} + +/// An abstract interface to handle both a local (see `local::LocalRegistry`) +/// and remote (see `remote::RemoteRegistry`) registry. 
+/// +/// This allows [`RegistrySource`] to abstractly handle both registry kinds. +pub trait RegistryData { + /// Performs initialization for the registry. + /// + /// This should be safe to call multiple times, the implementation is + /// expected to not do any work if it is already prepared. + fn prepare(&self) -> CargoResult<()>; + + /// Returns the path to the index. + /// + /// Note that different registries store the index in different formats + /// (remote=git, local=files). + fn index_path(&self) -> &Filesystem; + + /// Loads the JSON for a specific named package from the index. + /// + /// * `root` is the root path to the index. + /// * `path` is the relative path to the package to load (like `ca/rg/cargo`). + /// * `index_version` is the version of the requested crate data currently in cache. + fn load( + &mut self, + root: &Path, + path: &Path, + index_version: Option<&str>, + ) -> Poll<CargoResult<LoadResponse>>; + + /// Loads the `config.json` file and returns it. + /// + /// Local registries don't have a config, and return `None`. + fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>>; + + /// Invalidates locally cached data. + fn invalidate_cache(&mut self); + + /// Is the local cached data up-to-date? + fn is_updated(&self) -> bool; + + /// Prepare to start downloading a `.crate` file. + /// + /// Despite the name, this doesn't actually download anything. If the + /// `.crate` is already downloaded, then it returns [`MaybeLock::Ready`]. + /// If it hasn't been downloaded, then it returns [`MaybeLock::Download`] + /// which contains the URL to download. The [`crate::core::package::Downloads`] + /// system handles the actual download process. After downloading, it + /// calls [`Self::finish_download`] to save the downloaded file. + /// + /// `checksum` is currently only used by local registries to verify the + /// file contents (because local registries never actually download + /// anything). Remote registries will validate the checksum in + /// `finish_download`. For already downloaded `.crate` files, it does not + /// validate the checksum, assuming the filesystem does not suffer from + /// corruption or manipulation. + fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock>; + + /// Finish a download by saving a `.crate` file to disk. + /// + /// After [`crate::core::package::Downloads`] has finished a download, + /// it will call this to save the `.crate` file. This is only relevant + /// for remote registries. This should validate the checksum and save + /// the given data to the on-disk cache. + /// + /// Returns a [`File`] handle to the `.crate` file, positioned at the start. + fn finish_download(&mut self, pkg: PackageId, checksum: &str, data: &[u8]) + -> CargoResult<File>; + + /// Returns whether or not the `.crate` file is already downloaded. + fn is_crate_downloaded(&self, _pkg: PackageId) -> bool { + true + } + + /// Validates that the global package cache lock is held. + /// + /// Given the [`Filesystem`], this will make sure that the package cache + /// lock is held. If not, it will panic. See + /// [`Config::acquire_package_cache_lock`] for acquiring the global lock. + /// + /// Returns the [`Path`] to the [`Filesystem`]. + fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path; + + /// Block until all outstanding Poll::Pending requests are Poll::Ready. 
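+ ///
+ /// Callers typically drive this with a small poll loop, along these lines
+ /// (a sketch; the real loop lives in `RegistrySource::download` further
+ /// down in this file):
+ ///
+ /// ```ignore
+ /// let hash = loop {
+ ///     match index.hash(pkg, &mut *ops)? {
+ ///         Poll::Pending => ops.block_until_ready()?,
+ ///         Poll::Ready(hash) => break hash,
+ ///     }
+ /// };
+ /// ```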
+ fn block_until_ready(&mut self) -> CargoResult<()>; +} + +/// The status of [`RegistryData::download`] which indicates if a `.crate` +/// file has already been downloaded, or if not then the URL to download. +pub enum MaybeLock { + /// The `.crate` file is already downloaded. [`File`] is a handle to the + /// opened `.crate` file on the filesystem. + Ready(File), + /// The `.crate` file is not downloaded, here's the URL to download it from. + /// + /// `descriptor` is just a text string to display to the user of what is + /// being downloaded. + Download { + url: String, + descriptor: String, + authorization: Option<String>, + }, +} + +mod download; +mod http_remote; +mod index; +mod local; +mod remote; + +fn short_name(id: SourceId) -> String { + let hash = hex::short_hash(&id); + let ident = id.url().host_str().unwrap_or("").to_string(); + format!("{}-{}", ident, hash) +} + +impl<'cfg> RegistrySource<'cfg> { + pub fn remote( + source_id: SourceId, + yanked_whitelist: &HashSet<PackageId>, + config: &'cfg Config, + ) -> CargoResult<RegistrySource<'cfg>> { + assert!(source_id.is_remote_registry()); + let name = short_name(source_id); + let ops = if source_id.is_sparse() { + Box::new(http_remote::HttpRegistry::new(source_id, config, &name)?) as Box<_> + } else { + Box::new(remote::RemoteRegistry::new(source_id, config, &name)) as Box<_> + }; + + Ok(RegistrySource::new( + source_id, + config, + &name, + ops, + yanked_whitelist, + )) + } + + pub fn local( + source_id: SourceId, + path: &Path, + yanked_whitelist: &HashSet<PackageId>, + config: &'cfg Config, + ) -> RegistrySource<'cfg> { + let name = short_name(source_id); + let ops = local::LocalRegistry::new(path, config, &name); + RegistrySource::new(source_id, config, &name, Box::new(ops), yanked_whitelist) + } + + fn new( + source_id: SourceId, + config: &'cfg Config, + name: &str, + ops: Box<dyn RegistryData + 'cfg>, + yanked_whitelist: &HashSet<PackageId>, + ) -> RegistrySource<'cfg> { + RegistrySource { + src_path: config.registry_source_path().join(name), + config, + source_id, + index: index::RegistryIndex::new(source_id, ops.index_path(), config), + yanked_whitelist: yanked_whitelist.clone(), + ops, + } + } + + /// Decode the configuration stored within the registry. + /// + /// This requires that the index has been at least checked out. + pub fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> { + self.ops.config() + } + + /// Unpacks a downloaded package into a location where it's ready to be + /// compiled. + /// + /// No action is taken if the source looks like it's already unpacked. + fn unpack_package(&self, pkg: PackageId, tarball: &File) -> CargoResult<PathBuf> { + // The `.cargo-ok` file is used to track if the source is already + // unpacked. + let package_dir = format!("{}-{}", pkg.name(), pkg.version()); + let dst = self.src_path.join(&package_dir); + let path = dst.join(PACKAGE_SOURCE_LOCK); + let path = self.config.assert_package_cache_locked(&path); + let unpack_dir = path.parent().unwrap(); + match path.metadata() { + Ok(meta) if meta.len() > 0 => return Ok(unpack_dir.to_path_buf()), + Ok(_meta) => { + // The `.cargo-ok` file is not in a state we expect it to be + // (with two bytes containing "ok"). + // + // Cargo has always included a `.cargo-ok` file to detect if + // extraction was interrupted, but it was originally empty. + // + // In 1.34, Cargo was changed to create the `.cargo-ok` file + // before it started extraction to implement fine-grained + // locking. 
After it was finished extracting, it wrote two + // bytes to indicate it was complete. It would use the length + // check to detect if it was possibly interrupted. + // + // In 1.36, Cargo changed to not use fine-grained locking, and + // instead used a global lock. The use of `.cargo-ok` was no + // longer needed for locking purposes, but was kept to detect + // when extraction was interrupted. + // + // In 1.49, Cargo changed to not create the `.cargo-ok` file + // before it started extraction to deal with `.crate` files + // that inexplicably had a `.cargo-ok` file in them. + // + // In 1.64, Cargo changed to detect `.crate` files with + // `.cargo-ok` files in them in response to CVE-2022-36113, + // which dealt with malicious `.crate` files making + // `.cargo-ok` a symlink causing cargo to write "ok" to any + // arbitrary file on the filesystem it has permission to. + // + // This is all a long-winded way of explaining the + // circumstances that might cause a directory to contain a + // `.cargo-ok` file that is empty or otherwise corrupted. + // Either this was extracted by a version of Rust before 1.34, + // in which case everything should be fine. However, an empty + // file created by versions 1.36 to 1.49 indicates that the + // extraction was interrupted and that we need to start again. + // + // Another possibility is that the filesystem is simply + // corrupted, in which case deleting the directory might be + // the safe thing to do. That is probably unlikely, though. + // + // To be safe, this deletes the directory and starts over + // again. + log::warn!("unexpected length of {path:?}, clearing cache"); + paths::remove_dir_all(dst.as_path_unlocked())?; + } + Err(e) if e.kind() == io::ErrorKind::NotFound => {} + Err(e) => anyhow::bail!("failed to access package completion {path:?}: {e}"), + } + dst.create_dir()?; + let mut tar = { + let size_limit = max_unpack_size(self.config, tarball.metadata()?.len()); + let gz = GzDecoder::new(tarball); + let gz = LimitErrorReader::new(gz, size_limit); + Archive::new(gz) + }; + let prefix = unpack_dir.file_name().unwrap(); + let parent = unpack_dir.parent().unwrap(); + for entry in tar.entries()? { + let mut entry = entry.with_context(|| "failed to iterate over archive")?; + let entry_path = entry + .path() + .with_context(|| "failed to read entry path")? + .into_owned(); + + // We're going to unpack this tarball into the global source + // directory, but we want to make sure that it doesn't accidentally + // (or maliciously) overwrite source code from other crates. Cargo + // itself should never generate a tarball that hits this error, and + // crates.io should also block uploads with these sorts of tarballs, + // but be extra sure by adding a check here as well. + if !entry_path.starts_with(prefix) { + anyhow::bail!( + "invalid tarball downloaded, contains \ + a file at {:?} which isn't under {:?}", + entry_path, + prefix + ) + } + // Prevent unpacking the lockfile from the crate itself. 
+ if entry_path + .file_name() + .map_or(false, |p| p == PACKAGE_SOURCE_LOCK) + { + continue; + } + // Unpacking failed + let mut result = entry.unpack_in(parent).map_err(anyhow::Error::from); + if cfg!(windows) && restricted_names::is_windows_reserved_path(&entry_path) { + result = result.with_context(|| { + format!( + "`{}` appears to contain a reserved Windows path, \ + it cannot be extracted on Windows", + entry_path.display() + ) + }); + } + result + .with_context(|| format!("failed to unpack entry at `{}`", entry_path.display()))?; + } + + // Now that we've finished unpacking, create and write to the lock file to indicate that + // unpacking was successful. + let mut ok = OpenOptions::new() + .create_new(true) + .read(true) + .write(true) + .open(&path) + .with_context(|| format!("failed to open `{}`", path.display()))?; + write!(ok, "ok")?; + + Ok(unpack_dir.to_path_buf()) + } + + fn get_pkg(&mut self, package: PackageId, path: &File) -> CargoResult<Package> { + let path = self + .unpack_package(package, path) + .with_context(|| format!("failed to unpack package `{}`", package))?; + let mut src = PathSource::new(&path, self.source_id, self.config); + src.update()?; + let mut pkg = match src.download(package)? { + MaybePackage::Ready(pkg) => pkg, + MaybePackage::Download { .. } => unreachable!(), + }; + + // After we've loaded the package configure its summary's `checksum` + // field with the checksum we know for this `PackageId`. + let req = OptVersionReq::exact(package.version()); + let summary_with_cksum = self + .index + .summaries(package.name(), &req, &mut *self.ops)? + .expect("a downloaded dep now pending!?") + .map(|s| s.summary.clone()) + .next() + .expect("summary not found"); + if let Some(cksum) = summary_with_cksum.checksum() { + pkg.manifest_mut() + .summary_mut() + .set_checksum(cksum.to_string()); + } + + Ok(pkg) + } +} + +impl<'cfg> Source for RegistrySource<'cfg> { + fn query( + &mut self, + dep: &Dependency, + kind: QueryKind, + f: &mut dyn FnMut(Summary), + ) -> Poll<CargoResult<()>> { + // If this is a precise dependency, then it came from a lock file and in + // theory the registry is known to contain this version. If, however, we + // come back with no summaries, then our registry may need to be + // updated, so we fall back to performing a lazy update. + if kind == QueryKind::Exact && dep.source_id().precise().is_some() && !self.ops.is_updated() + { + debug!("attempting query without update"); + let mut called = false; + let pend = + self.index + .query_inner(dep, &mut *self.ops, &self.yanked_whitelist, &mut |s| { + if dep.matches(&s) { + called = true; + f(s); + } + })?; + if pend.is_pending() { + return Poll::Pending; + } + if called { + return Poll::Ready(Ok(())); + } else { + debug!("falling back to an update"); + self.invalidate_cache(); + return Poll::Pending; + } + } + + self.index + .query_inner(dep, &mut *self.ops, &self.yanked_whitelist, &mut |s| { + let matched = match kind { + QueryKind::Exact => dep.matches(&s), + QueryKind::Fuzzy => true, + }; + if matched { + f(s); + } + }) + } + + fn supports_checksums(&self) -> bool { + true + } + + fn requires_precise(&self) -> bool { + false + } + + fn source_id(&self) -> SourceId { + self.source_id + } + + fn invalidate_cache(&mut self) { + self.index.clear_summaries_cache(); + self.ops.invalidate_cache(); + } + + fn download(&mut self, package: PackageId) -> CargoResult<MaybePackage> { + let hash = loop { + match self.index.hash(package, &mut *self.ops)? 
{ + Poll::Pending => self.block_until_ready()?, + Poll::Ready(hash) => break hash, + } + }; + match self.ops.download(package, hash)? { + MaybeLock::Ready(file) => self.get_pkg(package, &file).map(MaybePackage::Ready), + MaybeLock::Download { + url, + descriptor, + authorization, + } => Ok(MaybePackage::Download { + url, + descriptor, + authorization, + }), + } + } + + fn finish_download(&mut self, package: PackageId, data: Vec<u8>) -> CargoResult<Package> { + let hash = loop { + match self.index.hash(package, &mut *self.ops)? { + Poll::Pending => self.block_until_ready()?, + Poll::Ready(hash) => break hash, + } + }; + let file = self.ops.finish_download(package, hash, &data)?; + self.get_pkg(package, &file) + } + + fn fingerprint(&self, pkg: &Package) -> CargoResult<String> { + Ok(pkg.package_id().version().to_string()) + } + + fn describe(&self) -> String { + self.source_id.display_index() + } + + fn add_to_yanked_whitelist(&mut self, pkgs: &[PackageId]) { + self.yanked_whitelist.extend(pkgs); + } + + fn is_yanked(&mut self, pkg: PackageId) -> Poll<CargoResult<bool>> { + self.index.is_yanked(pkg, &mut *self.ops) + } + + fn block_until_ready(&mut self) -> CargoResult<()> { + // Before starting to work on the registry, make sure that + // `<cargo_home>/registry` is marked as excluded from indexing and + // backups. Older versions of Cargo didn't do this, so we do it here + // regardless of whether `<cargo_home>` exists. + // + // This does not use `create_dir_all_excluded_from_backups_atomic` for + // the same reason: we want to exclude it even if the directory already + // exists. + // + // IO errors in creating and marking it are ignored, e.g. in case we're on a + // read-only filesystem. + let registry_base = self.config.registry_base_path(); + let _ = registry_base.create_dir(); + exclude_from_backups_and_indexing(®istry_base.into_path_unlocked()); + + self.ops.block_until_ready() + } +} + +/// Get the maximum upack size that Cargo permits +/// based on a given `size of your compressed file. +/// +/// Returns the larger one between `size * max compression ratio` +/// and a fixed max unpacked size. +/// +/// In reality, the compression ratio usually falls in the range of 2:1 to 10:1. +/// We choose 20:1 to cover almost all possible cases hopefully. +/// Any ratio higher than this is considered as a zip bomb. +/// +/// In the future we might want to introduce a configurable size. +/// +/// Some of the real world data from common compression algorithms: +/// +/// * <https://www.zlib.net/zlib_tech.html> +/// * <https://cran.r-project.org/web/packages/brotli/vignettes/brotli-2015-09-22.pdf> +/// * <https://blog.cloudflare.com/results-experimenting-brotli/> +/// * <https://tukaani.org/lzma/benchmarks.html> +fn max_unpack_size(config: &Config, size: u64) -> u64 { + const SIZE_VAR: &str = "__CARGO_TEST_MAX_UNPACK_SIZE"; + const RATIO_VAR: &str = "__CARGO_TEST_MAX_UNPACK_RATIO"; + let max_unpack_size = if cfg!(debug_assertions) && config.get_env(SIZE_VAR).is_ok() { + // For integration test only. + config + .get_env(SIZE_VAR) + .unwrap() + .parse() + .expect("a max unpack size in bytes") + } else { + MAX_UNPACK_SIZE + }; + let max_compression_ratio = if cfg!(debug_assertions) && config.get_env(RATIO_VAR).is_ok() { + // For integration test only. 
+ config + .get_env(RATIO_VAR) + .unwrap() + .parse() + .expect("a max compression ratio in bytes") + } else { + MAX_COMPRESSION_RATIO + }; + + u64::max(max_unpack_size, size * max_compression_ratio as u64) +} + +fn make_dep_prefix(name: &str) -> String { + match name.len() { + 1 => String::from("1"), + 2 => String::from("2"), + 3 => format!("3/{}", &name[..1]), + _ => format!("{}/{}", &name[0..2], &name[2..4]), + } +} + +#[cfg(test)] +mod tests { + use super::make_dep_prefix; + + #[test] + fn dep_prefix() { + assert_eq!(make_dep_prefix("a"), "1"); + assert_eq!(make_dep_prefix("ab"), "2"); + assert_eq!(make_dep_prefix("abc"), "3/a"); + assert_eq!(make_dep_prefix("Abc"), "3/A"); + assert_eq!(make_dep_prefix("AbCd"), "Ab/Cd"); + assert_eq!(make_dep_prefix("aBcDe"), "aB/cD"); + } +} diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs new file mode 100644 index 0000000..aa0ec90 --- /dev/null +++ b/src/cargo/sources/registry/remote.rs @@ -0,0 +1,358 @@ +use crate::core::{GitReference, PackageId, SourceId}; +use crate::sources::git; +use crate::sources::registry::download; +use crate::sources::registry::MaybeLock; +use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData}; +use crate::util::errors::CargoResult; +use crate::util::interning::InternedString; +use crate::util::{Config, Filesystem}; +use anyhow::Context as _; +use cargo_util::paths; +use lazycell::LazyCell; +use log::{debug, trace}; +use std::cell::{Cell, Ref, RefCell}; +use std::fs::File; +use std::mem; +use std::path::Path; +use std::str; +use std::task::{ready, Poll}; + +/// A remote registry is a registry that lives at a remote URL (such as +/// crates.io). The git index is cloned locally, and `.crate` files are +/// downloaded as needed and cached locally. +pub struct RemoteRegistry<'cfg> { + index_path: Filesystem, + /// Path to the cache of `.crate` files (`$CARGO_HOME/registry/path/$REG-HASH`). + cache_path: Filesystem, + source_id: SourceId, + index_git_ref: GitReference, + config: &'cfg Config, + tree: RefCell<Option<git2::Tree<'static>>>, + repo: LazyCell<git2::Repository>, + head: Cell<Option<git2::Oid>>, + current_sha: Cell<Option<InternedString>>, + needs_update: bool, // Does this registry need to be updated? +} + +impl<'cfg> RemoteRegistry<'cfg> { + pub fn new(source_id: SourceId, config: &'cfg Config, name: &str) -> RemoteRegistry<'cfg> { + RemoteRegistry { + index_path: config.registry_index_path().join(name), + cache_path: config.registry_cache_path().join(name), + source_id, + config, + // TODO: we should probably make this configurable + index_git_ref: GitReference::DefaultBranch, + tree: RefCell::new(None), + repo: LazyCell::new(), + head: Cell::new(None), + current_sha: Cell::new(None), + needs_update: false, + } + } + + fn repo(&self) -> CargoResult<&git2::Repository> { + self.repo.try_borrow_with(|| { + let path = self.config.assert_package_cache_locked(&self.index_path); + + // Fast path without a lock + if let Ok(repo) = git2::Repository::open(&path) { + trace!("opened a repo without a lock"); + return Ok(repo); + } + + // Ok, now we need to lock and try the whole thing over again. + trace!("acquiring registry index lock"); + match git2::Repository::open(&path) { + Ok(repo) => Ok(repo), + Err(_) => { + drop(paths::remove_dir_all(&path)); + paths::create_dir_all(&path)?; + + // Note that we'd actually prefer to use a bare repository + // here as we're not actually going to check anything out. 
+ // All versions of Cargo, though, share the same CARGO_HOME, + // so for compatibility with older Cargo which *does* do + // checkouts we make sure to initialize a new full + // repository (not a bare one). + // + // We should change this to `init_bare` whenever we feel + // like enough time has passed or if we change the directory + // that the folder is located in, such as by changing the + // hash at the end of the directory. + // + // Note that in the meantime we also skip `init.templatedir` + // as it can be misconfigured sometimes or otherwise add + // things that we don't want. + let mut opts = git2::RepositoryInitOptions::new(); + opts.external_template(false); + Ok(git2::Repository::init_opts(&path, &opts).with_context(|| { + format!("failed to initialize index git repository (in {:?})", path) + })?) + } + } + }) + } + + fn head(&self) -> CargoResult<git2::Oid> { + if self.head.get().is_none() { + let repo = self.repo()?; + let oid = self.index_git_ref.resolve(repo)?; + self.head.set(Some(oid)); + } + Ok(self.head.get().unwrap()) + } + + fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> { + { + let tree = self.tree.borrow(); + if tree.is_some() { + return Ok(Ref::map(tree, |s| s.as_ref().unwrap())); + } + } + let repo = self.repo()?; + let commit = repo.find_commit(self.head()?)?; + let tree = commit.tree()?; + + // Unfortunately in libgit2 the tree objects look like they've got a + // reference to the repository object which means that a tree cannot + // outlive the repository that it came from. Here we want to cache this + // tree, though, so to accomplish this we transmute it to a static + // lifetime. + // + // Note that we don't actually hand out the static lifetime, instead we + // only return a scoped one from this function. Additionally the repo + // we loaded from (above) lives as long as this object + // (`RemoteRegistry`) so we then just need to ensure that the tree is + // destroyed first in the destructor, hence the destructor on + // `RemoteRegistry` below. + let tree = unsafe { mem::transmute::<git2::Tree<'_>, git2::Tree<'static>>(tree) }; + *self.tree.borrow_mut() = Some(tree); + Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap())) + } + + fn current_version(&self) -> Option<InternedString> { + if let Some(sha) = self.current_sha.get() { + return Some(sha); + } + let sha = InternedString::new(&self.head().ok()?.to_string()); + self.current_sha.set(Some(sha)); + Some(sha) + } + + fn is_updated(&self) -> bool { + self.config.updated_sources().contains(&self.source_id) + } + + fn mark_updated(&self) { + self.config.updated_sources().insert(self.source_id); + } +} + +const LAST_UPDATED_FILE: &str = ".last-updated"; + +impl<'cfg> RegistryData for RemoteRegistry<'cfg> { + fn prepare(&self) -> CargoResult<()> { + self.repo()?; // create intermediate dirs and initialize the repo + Ok(()) + } + + fn index_path(&self) -> &Filesystem { + &self.index_path + } + + fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path { + self.config.assert_package_cache_locked(path) + } + + // `index_version` Is a string representing the version of the file used to construct the cached copy. + // Older versions of Cargo used the single value of the hash of the HEAD commit as a `index_version`. + // This is technically correct but a little too conservative. If a new commit is fetched all cached + // files need to be regenerated even if a particular file was not changed. 
+ // However if an old cargo has written such a file we still know how to read it, as long as we check for that hash value. + // + // Cargo now uses a hash of the file's contents as provided by git. + fn load( + &mut self, + _root: &Path, + path: &Path, + index_version: Option<&str>, + ) -> Poll<CargoResult<LoadResponse>> { + if self.needs_update { + return Poll::Pending; + } + // Check if the cache is valid. + let git_commit_hash = self.current_version(); + if index_version.is_some() && index_version == git_commit_hash.as_deref() { + // This file was written by an old version of cargo, but it is still up-to-date. + return Poll::Ready(Ok(LoadResponse::CacheValid)); + } + // Note that the index calls this method and the filesystem is locked + // in the index, so we don't need to worry about an `update_index` + // happening in a different process. + fn load_helper( + registry: &RemoteRegistry<'_>, + path: &Path, + index_version: Option<&str>, + ) -> CargoResult<LoadResponse> { + let repo = registry.repo()?; + let tree = registry.tree()?; + let entry = tree.get_path(path); + let entry = entry?; + let git_file_hash = Some(entry.id().to_string()); + + // Check if the cache is valid. + if index_version.is_some() && index_version == git_file_hash.as_deref() { + return Ok(LoadResponse::CacheValid); + } + + let object = entry.to_object(repo)?; + let blob = match object.as_blob() { + Some(blob) => blob, + None => anyhow::bail!("path `{}` is not a blob in the git repo", path.display()), + }; + + Ok(LoadResponse::Data { + raw_data: blob.content().to_vec(), + index_version: git_file_hash, + }) + } + + match load_helper(&self, path, index_version) { + Ok(result) => Poll::Ready(Ok(result)), + Err(_) if !self.is_updated() => { + // If git returns an error and we haven't updated the repo, return + // pending to allow an update to try again. + self.needs_update = true; + Poll::Pending + } + Err(e) + if e.downcast_ref::<git2::Error>() + .map(|e| e.code() == git2::ErrorCode::NotFound) + .unwrap_or_default() => + { + // The repo has been updated and the file does not exist. + Poll::Ready(Ok(LoadResponse::NotFound)) + } + Err(e) => Poll::Ready(Err(e)), + } + } + + fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> { + debug!("loading config"); + self.prepare()?; + self.config.assert_package_cache_locked(&self.index_path); + match ready!(self.load(Path::new(""), Path::new("config.json"), None)?) { + LoadResponse::Data { raw_data, .. } => { + trace!("config loaded"); + let mut cfg: RegistryConfig = serde_json::from_slice(&raw_data)?; + if !self.config.cli_unstable().registry_auth { + cfg.auth_required = false; + } + Poll::Ready(Ok(Some(cfg))) + } + _ => Poll::Ready(Ok(None)), + } + } + + fn block_until_ready(&mut self) -> CargoResult<()> { + if !self.needs_update { + return Ok(()); + } + + self.needs_update = false; + + // Make sure the index is only updated once per session since it is an + // expensive operation. This generally only happens when the resolver + // is run multiple times, such as during `cargo publish`. + if self.is_updated() { + return Ok(()); + } + self.mark_updated(); + + if self.config.offline() { + return Ok(()); + } + if self.config.cli_unstable().no_index_update { + return Ok(()); + } + + debug!("updating the index"); + + // Ensure that we'll actually be able to acquire an HTTP handle later on + // once we start trying to download crates. This will weed out any + // problems with `.cargo/config` configuration related to HTTP. 
+ // + // This way if there's a problem the error gets printed before we even + // hit the index, which may not actually read this configuration. + self.config.http()?; + + self.prepare()?; + self.head.set(None); + *self.tree.borrow_mut() = None; + self.current_sha.set(None); + let path = self.config.assert_package_cache_locked(&self.index_path); + self.config + .shell() + .status("Updating", self.source_id.display_index())?; + + // Fetch the latest version of our `index_git_ref` into the index + // checkout. + let url = self.source_id.url(); + let repo = self.repo.borrow_mut().unwrap(); + git::fetch(repo, url.as_str(), &self.index_git_ref, self.config) + .with_context(|| format!("failed to fetch `{}`", url))?; + + // Create a dummy file to record the mtime for when we updated the + // index. + paths::create(&path.join(LAST_UPDATED_FILE))?; + + Ok(()) + } + + fn invalidate_cache(&mut self) { + // To fully invalidate, undo `mark_updated`s work + self.needs_update = true; + } + + fn is_updated(&self) -> bool { + self.is_updated() + } + + fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> { + let registry_config = loop { + match self.config()? { + Poll::Pending => self.block_until_ready()?, + Poll::Ready(cfg) => break cfg.unwrap(), + } + }; + + download::download( + &self.cache_path, + &self.config, + pkg, + checksum, + registry_config, + ) + } + + fn finish_download( + &mut self, + pkg: PackageId, + checksum: &str, + data: &[u8], + ) -> CargoResult<File> { + download::finish_download(&self.cache_path, &self.config, pkg, checksum, data) + } + + fn is_crate_downloaded(&self, pkg: PackageId) -> bool { + download::is_crate_downloaded(&self.cache_path, &self.config, pkg) + } +} + +impl<'cfg> Drop for RemoteRegistry<'cfg> { + fn drop(&mut self) { + // Just be sure to drop this before our other fields + self.tree.borrow_mut().take(); + } +} |
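As a closing illustration of the unpack-size guard documented in `mod.rs` above: the cap on unpacked bytes is the larger of the fixed `MAX_UNPACK_SIZE` allowance and `MAX_COMPRESSION_RATIO` times the compressed `.crate` size, so small crates get the fixed 512 MiB allowance while very large crates are bounded by the 20:1 ratio. A standalone sketch of that arithmetic, without the test-only environment overrides (the sizes in the asserts are invented):

```rust
const MAX_UNPACK_SIZE: u64 = 512 * 1024 * 1024; // 512 MiB
const MAX_COMPRESSION_RATIO: u64 = 20; // 20:1

/// Mirrors the shape of `max_unpack_size` above: the larger of the fixed
/// allowance and the ratio-scaled compressed size.
fn max_unpack_size(compressed: u64) -> u64 {
    u64::max(MAX_UNPACK_SIZE, compressed * MAX_COMPRESSION_RATIO)
}

fn main() {
    // A 100 KiB `.crate` may unpack to at most 512 MiB: the fixed allowance wins.
    assert_eq!(max_unpack_size(100 * 1024), 512 * 1024 * 1024);
    // A 1 GiB `.crate` may unpack to at most 20 GiB: the 20:1 ratio wins.
    assert_eq!(max_unpack_size(1 << 30), 20 * (1 << 30));
}
```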