Skip to content

Commit fc853d6

Browse files
committed
Fix Windows absolute path parsing and remove HTTP assumption
This commit fixes issue #972 where Windows absolute paths like C:\path were incorrectly parsed as URLs with scheme C:. Key changes: - Added WindowsPath newtype with proper detection using pattern matching - Moved Windows path logic to separate submodule for better organization - Removed automatic HTTP assumption (foo -> http://foo/) - Added InvalidInput error type with helpful error messages - Updated all tests to reflect new behavior Fixes #972
1 parent dab8952 commit fc853d6

File tree

4 files changed

+123
-21
lines changed

4 files changed

+123
-21
lines changed

lychee-lib/src/types/error.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ pub enum ErrorKind {
116116
#[error("Invalid file path: {0}")]
117117
InvalidFile(PathBuf),
118118

119+
/// The given input is neither a valid file path nor a valid URL
120+
#[error("{0}")]
121+
InvalidInput(String),
122+
119123
/// Error while traversing an input directory
120124
#[error("Cannot traverse input directory: {0}")]
121125
DirTraversal(#[from] ignore::Error),
@@ -340,6 +344,7 @@ impl ErrorKind {
340344
ErrorKind::InvalidIndexFile(_path) => Some(
341345
"Index file not found in directory. Check if index.html or other index files exist".to_string()
342346
),
347+
ErrorKind::InvalidInput(_) => None, // Error message is already in the error itself
343348
}
344349
}
345350

@@ -410,6 +415,7 @@ impl PartialEq for ErrorKind {
410415
}
411416
(Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2,
412417
(Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2,
418+
(Self::InvalidInput(s1), Self::InvalidInput(s2)) => s1 == s2,
413419
(Self::InvalidFilePath(u1), Self::InvalidFilePath(u2)) => u1 == u2,
414420
(Self::InvalidFragment(u1), Self::InvalidFragment(u2)) => u1 == u2,
415421
(Self::InvalidIndexFile(p1), Self::InvalidIndexFile(p2)) => p1 == p2,
@@ -445,6 +451,7 @@ impl Hash for ErrorKind {
445451
Self::InvalidGithubUrl(s) => s.hash(state),
446452
Self::DirTraversal(e) => e.to_string().hash(state),
447453
Self::InvalidFile(e) => e.to_string_lossy().hash(state),
454+
Self::InvalidInput(s) => s.hash(state),
448455
Self::EmptyUrl => "Empty URL".hash(state),
449456
Self::ParseUrl(e, s) => (e.to_string(), s).hash(state),
450457
Self::InvalidURI(u) => u.hash(state),

lychee-lib/src/types/input/input.rs

Lines changed: 85 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use super::InputResolver;
77
use super::content::InputContent;
88
use super::source::InputSource;
99
use super::source::ResolvedInputSource;
10+
use super::windows_path::WindowsPath;
1011
use crate::filter::PathExcludes;
1112
use crate::types::FileType;
1213
use crate::types::file::FileExtensions;
@@ -53,12 +54,18 @@ impl Input {
5354
) -> Result<Self> {
5455
let source = if input == STDIN {
5556
InputSource::Stdin
57+
} else if let Some(windows_path) = WindowsPath::try_from(input) {
58+
// Handle Windows absolute paths (e.g., C:\path) before URL parsing
59+
let path = windows_path.as_path();
60+
if path.exists() {
61+
InputSource::FsPath(path.to_path_buf())
62+
} else {
63+
return Err(ErrorKind::InvalidFile(path.to_path_buf()));
64+
}
5665
} else {
5766
// We use [`reqwest::Url::parse`] because it catches some other edge cases that [`http::Request:builder`] does not
5867
match Url::parse(input) {
59-
// Weed out non-HTTP schemes, including Windows drive
60-
// specifiers, which can be parsed by the
61-
// [url](https://crates.io/crates/url) crate
68+
// Only accept HTTP and HTTPS URLs
6269
Ok(url) if url.scheme() == "http" || url.scheme() == "https" => {
6370
InputSource::RemoteUrl(Box::new(url))
6471
}
@@ -106,19 +113,11 @@ impl Input {
106113
// but it catches the most common ones
107114
return Err(ErrorKind::InvalidFile(path));
108115
} else {
109-
// Invalid path; check if a valid URL can be constructed from the input
110-
// by prefixing it with a `http://` scheme.
111-
//
112-
// Curl also uses http (i.e. not https), see
113-
// https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
114-
//
115-
// TODO: We should get rid of this heuristic and
116-
// require users to provide a full URL with scheme.
117-
// This is a big source of confusion to users.
118-
let url = Url::parse(&format!("http://{input}")).map_err(|e| {
119-
ErrorKind::ParseUrl(e, "Input is not a valid URL".to_string())
120-
})?;
121-
InputSource::RemoteUrl(Box::new(url))
116+
// Input is neither a valid file path nor a URL
117+
return Err(ErrorKind::InvalidInput(format!(
118+
"Input '{input}' not found as file and not a valid URL. \
119+
Use full URL (e.g., https://example.com) or check file path."
120+
)));
122121
}
123122
}
124123
}
@@ -406,7 +405,8 @@ mod tests {
406405

407406
#[test]
408407
fn test_input_handles_real_relative_paths() {
409-
let test_file = "./Cargo.toml";
408+
// Use current directory which should always exist
409+
let test_file = ".";
410410
let path = Path::new(test_file);
411411

412412
assert!(path.exists());
@@ -463,11 +463,13 @@ mod tests {
463463

464464
#[test]
465465
fn test_url_without_scheme() {
466+
// URLs without scheme should now fail with helpful error message
466467
let input = Input::from_value("example.com");
467-
assert_eq!(
468-
input.unwrap().source.to_string(),
469-
String::from("http://example.com/")
470-
);
468+
assert!(matches!(input, Err(ErrorKind::InvalidInput(_))));
469+
470+
if let Err(ErrorKind::InvalidInput(msg)) = input {
471+
assert!(msg.contains("Use full URL"));
472+
}
471473
}
472474

473475
// Ensure that a Windows file path is not mistaken for a URL.
@@ -579,4 +581,66 @@ mod tests {
579581
})
580582
));
581583
}
584+
585+
#[test]
586+
fn test_windows_absolute_path_detection() {
587+
// Valid Windows absolute paths
588+
assert!(WindowsPath::try_from("C:\\").is_some());
589+
assert!(WindowsPath::try_from("C:\\folder").is_some());
590+
assert!(WindowsPath::try_from("D:\\folder\\file.txt").is_some());
591+
assert!(WindowsPath::try_from("Z:/folder/file.txt").is_some());
592+
593+
// Invalid cases
594+
assert!(WindowsPath::try_from("C:").is_none()); // Too short
595+
assert!(WindowsPath::try_from("c:\\").is_none()); // Lowercase
596+
assert!(WindowsPath::try_from("CC:\\").is_none()); // Two letters
597+
assert!(WindowsPath::try_from("C-\\").is_none()); // Not colon
598+
assert!(WindowsPath::try_from("C:file").is_none()); // No separator
599+
assert!(WindowsPath::try_from("https://example.com").is_none()); // URL
600+
assert!(WindowsPath::try_from("./relative").is_none()); // Relative path
601+
}
602+
603+
#[test]
604+
fn test_windows_absolute_path_parsing() {
605+
use std::env::temp_dir;
606+
use tempfile::NamedTempFile;
607+
608+
// Test with existing file (simulated Windows path)
609+
if cfg!(windows) {
610+
let dir = temp_dir();
611+
let file = NamedTempFile::new_in(dir).unwrap();
612+
let path = file.path();
613+
let path_str = path.to_str().unwrap();
614+
615+
// Should parse as FsPath if file exists
616+
let input = Input::from_value(path_str).unwrap();
617+
assert!(matches!(input.source, InputSource::FsPath(_)));
618+
}
619+
}
620+
621+
#[test]
622+
fn test_no_http_assumption() {
623+
// These should now fail instead of being converted to http://
624+
assert!(matches!(
625+
Input::from_value("example.com"),
626+
Err(ErrorKind::InvalidInput(_))
627+
));
628+
assert!(matches!(
629+
Input::from_value("foo"),
630+
Err(ErrorKind::InvalidInput(_))
631+
));
632+
assert!(matches!(
633+
Input::from_value("subdomain.example.com"),
634+
Err(ErrorKind::InvalidInput(_))
635+
));
636+
637+
// Error message should be helpful
638+
if let Err(ErrorKind::InvalidInput(msg)) = Input::from_value("example.com") {
639+
assert!(msg.contains("not found as file"));
640+
assert!(msg.contains("not a valid URL"));
641+
assert!(msg.contains("https://example.com"));
642+
} else {
643+
panic!("Expected InvalidInput error with helpful message");
644+
}
645+
}
582646
}

lychee-lib/src/types/input/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ pub mod input;
2828
pub mod content;
2929
pub mod resolver;
3030
pub mod source;
31+
pub mod windows_path;
3132

3233
pub use content::InputContent;
3334
pub use input::Input;
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
//! Windows absolute path handling
2+
//!
3+
//! This module provides utilities for detecting and handling Windows absolute paths
4+
//! to prevent them from being misinterpreted as URLs.
5+
6+
use std::path::Path;
7+
8+
/// A newtype representing a Windows absolute path
9+
#[derive(Debug, Clone, PartialEq, Eq)]
10+
pub struct WindowsPath(String);
11+
12+
impl WindowsPath {
13+
/// Try to parse a string as a Windows absolute path
14+
pub fn try_from(input: &str) -> Option<Self> {
15+
let chars: Vec<char> = input.chars().take(3).collect();
16+
17+
matches!(
18+
chars.as_slice(),
19+
[drive, ':', sep] if drive.is_ascii_uppercase() && matches!(sep, '\\' | '/')
20+
)
21+
.then(|| WindowsPath(input.to_string()))
22+
}
23+
}
24+
25+
impl WindowsPath {
26+
/// Get a reference to the path
27+
pub fn as_path(&self) -> &Path {
28+
Path::new(&self.0)
29+
}
30+
}

0 commit comments

Comments
 (0)