11use scraper:: { Html , Selector } ;
2- use std:: { fs:: { self , File , create_dir_all} , io:: Write , path:: PathBuf } ; // Added PathBuf
2+ use std:: { collections :: HashMap , fs:: { self , File , create_dir_all} , io:: Write , path:: PathBuf } ; // Added PathBuf, HashMap
33use cargo:: core:: resolver:: features:: CliFeatures ;
44// use cargo::core::SourceId; // Removed unused import
55// use cargo::util::Filesystem; // Removed unused import
@@ -25,8 +25,7 @@ pub enum DocLoaderError {
2525 TempDirCreationFailed ( std:: io:: Error ) ,
2626 #[ error( "Cargo library error: {0}" ) ]
2727 CargoLib ( #[ from] AnyhowError ) , // Re-add CargoLib variant
28- #[ error( "Failed to strip prefix '{prefix}' from path '{path}': {source}" ) ] // Improved error
29- StripPrefix { prefix : PathBuf , path : PathBuf , source : std:: path:: StripPrefixError } ,
28+ // Removed unused StripPrefix variant
3029}
3130
3231// Simple struct to hold document content, maybe add path later if needed
@@ -36,6 +35,7 @@ pub struct Document {
3635 pub content : String ,
3736}
3837
38+
3939/// Generates documentation for a given crate in a temporary directory,
4040/// then loads and parses the HTML documents.
4141/// Extracts text content from the main content area of rustdoc generated HTML.
@@ -44,23 +44,12 @@ pub fn load_documents(
4444 crate_version_req : & str ,
4545 features : Option < & Vec < String > > , // Add optional features parameter
4646) -> Result < Vec < Document > , DocLoaderError > {
47- eprintln ! (
48- "[DEBUG] load_documents called with crate_name: '{}', crate_version_req: '{}', features: {:?}" ,
49- crate_name, crate_version_req, features
50- ) ;
5147 let mut documents = Vec :: new ( ) ;
5248
5349 let temp_dir = tempdir ( ) . map_err ( DocLoaderError :: TempDirCreationFailed ) ?;
5450 let temp_dir_path = temp_dir. path ( ) ;
5551 let temp_manifest_path = temp_dir_path. join ( "Cargo.toml" ) ;
5652
57- eprintln ! (
58- "Generating documentation for crate '{}' (Version Req: '{}', Features: {:?}) in temporary directory: {}" ,
59- crate_name,
60- crate_version_req,
61- features, // Log features
62- temp_dir_path. display( )
63- ) ;
6453
6554 // Create a temporary Cargo.toml using the version requirement and features
6655 let features_string = features
@@ -159,7 +148,6 @@ edition = "2021"
159148
160149 let docs_path = match ( found_count, target_docs_path) {
161150 ( 1 , Some ( path) ) => {
162- eprintln ! ( "[DEBUG] Confirmed unique documentation directory: {}" , path. display( ) ) ;
163151 path
164152 } ,
165153 ( 0 , _) => {
@@ -184,57 +172,143 @@ edition = "2021"
184172 let content_selector = Selector :: parse ( "section#main-content.content" )
185173 . map_err ( |e| DocLoaderError :: Selector ( e. to_string ( ) ) ) ?;
186174
187- for entry in WalkDir :: new ( & docs_path)
175+ // --- Collect all HTML file paths first ---
176+ let all_html_paths: Vec < PathBuf > = WalkDir :: new ( & docs_path)
188177 . into_iter ( )
189- . filter_map ( Result :: ok) // Ignore errors during iteration for now
190- . filter ( |e| !e. file_type ( ) . is_dir ( ) && e. path ( ) . extension ( ) . is_some_and ( |ext| ext == "html" ) )
191- {
192- let path = entry. path ( ) ;
193- // Calculate path relative to the docs_path root
194- let relative_path = path. strip_prefix ( & docs_path) . map_err ( |e| {
195- // Provide more context in the error message using the new error variant
196- DocLoaderError :: StripPrefix {
197- prefix : docs_path. to_path_buf ( ) ,
198- path : path. to_path_buf ( ) ,
199- source : e,
178+ . filter_map ( Result :: ok) // Ignore errors during iteration
179+ . filter ( |e| {
180+ !e. file_type ( ) . is_dir ( ) && e. path ( ) . extension ( ) . is_some_and ( |ext| ext == "html" )
181+ } )
182+ . map ( |e| e. into_path ( ) ) // Get the PathBuf
183+ . collect ( ) ;
184+
185+ eprintln ! ( "[DEBUG] Found {} total HTML files initially." , all_html_paths. len( ) ) ;
186+
187+ // --- Group files by basename ---
188+ let mut basename_groups: HashMap < String , Vec < PathBuf > > = HashMap :: new ( ) ;
189+ for path in all_html_paths {
190+ if let Some ( filename_osstr) = path. file_name ( ) {
191+ if let Some ( filename_str) = filename_osstr. to_str ( ) {
192+ basename_groups
193+ . entry ( filename_str. to_string ( ) )
194+ . or_default ( )
195+ . push ( path) ;
196+ } else {
197+ eprintln ! ( "[WARN] Skipping file with non-UTF8 name: {}" , path. display( ) ) ;
200198 }
201- } ) ?;
202- let path_str = relative_path. to_string_lossy ( ) . to_string ( ) ; // Use the relative path
203- // eprintln!("Processing file: {} (relative: {})", path.display(), path_str); // Updated debug log
199+ } else {
200+ eprintln ! ( "[WARN] Skipping file with no name: {}" , path. display( ) ) ;
201+ }
202+ }
203+
204+ // --- Initialize paths_to_process and explicitly add the root index.html if it exists ---
205+ let mut paths_to_process: Vec < PathBuf > = Vec :: new ( ) ;
206+ let root_index_path = docs_path. join ( "index.html" ) ;
207+ if root_index_path. is_file ( ) {
208+ paths_to_process. push ( root_index_path) ;
209+ }
210+
211+ // --- Filter based on duplicates and size ---
212+ // NOTE: Initialization of paths_to_process moved before this loop
213+ for ( basename, mut paths) in basename_groups {
214+ // Always ignore index.html at this stage (except the root one added earlier)
215+ if basename == "index.html" {
216+ continue ;
217+ }
218+
219+ // Also ignore files within source code view directories
220+ // Check the first path (they should share the problematic component if any)
221+ if paths. first ( ) . map_or ( false , |p| p. components ( ) . any ( |comp| comp. as_os_str ( ) == "src" ) ) {
222+ continue ;
223+ }
224+
225+
226+ if paths. len ( ) == 1 {
227+ // Single file with this basename (and not index.html), keep it
228+ paths_to_process. push ( paths. remove ( 0 ) ) ;
229+ } else {
230+ // Multiple files with the same basename (duplicates)
231+ // Find the largest one by file size
232+ // Explicit type annotation needed for the error type in try_fold
233+ let largest_path_result: Result < Option < ( PathBuf , u64 ) > , std:: io:: Error > = paths. into_iter ( ) . try_fold ( None :: < ( PathBuf , u64 ) > , |largest, current| {
234+ let current_meta = fs:: metadata ( & current) ?;
235+ let current_size = current_meta. len ( ) ;
236+ match largest {
237+ None => Ok ( Some ( ( current, current_size) ) ) ,
238+ Some ( ( largest_path_so_far, largest_size_so_far) ) => {
239+ if current_size > largest_size_so_far {
240+ Ok ( Some ( ( current, current_size) ) )
241+ } else {
242+ Ok ( Some ( ( largest_path_so_far, largest_size_so_far) ) )
243+ }
244+ }
245+ }
246+ } ) ;
247+
248+ match largest_path_result {
249+ Ok ( Some ( ( p, _size) ) ) => {
250+ // eprintln!("[DEBUG] Duplicate basename '{}': Keeping largest file {}", basename, p.display());
251+ paths_to_process. push ( p) ;
252+ }
253+ Ok ( None ) => {
254+ // This case should ideally not happen if the input `paths` was not empty,
255+ // but handle it defensively.
256+ eprintln ! ( "[WARN] No files found for basename '{}' during size comparison." , basename) ;
257+ }
258+ Err ( e) => {
259+ eprintln ! ( "[WARN] Error getting metadata for basename '{}', skipping: {}" , basename, e) ;
260+ // Decide if you want to skip the whole group or handle differently
261+ }
262+ }
263+ }
264+ }
265+
266+ eprintln ! ( "[DEBUG] Filtered down to {} files to process." , paths_to_process. len( ) ) ;
267+
268+
269+ // --- Process the filtered list of files ---
270+ for path in paths_to_process {
271+ // Calculate path relative to the docs_path root
272+ let relative_path = match path. strip_prefix ( & docs_path) {
273+ Ok ( p) => p. to_path_buf ( ) ,
274+ Err ( e) => {
275+ eprintln ! ( "[WARN] Failed to strip prefix {} from {}: {}" , docs_path. display( ) , path. display( ) , e) ;
276+ continue ; // Skip if path manipulation fails
277+ }
278+ } ;
279+ let path_str = relative_path. to_string_lossy ( ) . to_string ( ) ;
204280
205- // eprintln!(" Reading file content..."); // Added
206- let html_content = fs:: read_to_string ( path) ?; // Still read from the absolute path
207- // eprintln!(" Parsing HTML..."); // Added
281+ let html_content = match fs:: read_to_string ( & path) { // Read from the absolute path
282+ Ok ( content) => content,
283+ Err ( e) => {
284+ eprintln ! ( "[WARN] Failed to read file {}: {}" , path. display( ) , e) ;
285+ continue ; // Skip this file if reading fails
286+ }
287+ } ;
208288
209- // Parse the HTML document
210289 let document = Html :: parse_document ( & html_content) ;
211290
212- // Select the main content element
213291 if let Some ( main_content_element) = document. select ( & content_selector) . next ( ) {
214- // Extract all text nodes within the main content
215- // eprintln!(" Extracting text content..."); // Added
216292 let text_content: String = main_content_element
217293 . text ( )
218294 . map ( |s| s. trim ( ) )
219295 . filter ( |s| !s. is_empty ( ) )
220296 . collect :: < Vec < & str > > ( )
221- . join ( "\n " ) ; // Join text nodes with newlines
297+ . join ( "\n " ) ;
222298
223299 if !text_content. is_empty ( ) {
224- // eprintln!(" Extracted content ({} chars)", text_content.len()); // Uncommented and simplified
225300 documents. push ( Document {
226301 path : path_str,
227302 content : text_content,
228303 } ) ;
229304 } else {
230- // eprintln!("No text content found in main section for: {}", path.display()); // Verbose logging
305+ // eprintln!("[DEBUG] No text content found in main section for: {}", path.display());
231306 }
232307 } else {
233- // eprintln!("'main-content' selector not found for: {}", path.display()); // Verbose logging
234- // Optionally handle files without the main content selector differently
308+ // eprintln!("[DEBUG] 'main-content' selector not found for: {}", path.display());
235309 }
236310 }
237311
238- eprintln ! ( "Finished document loading. Found {} documents." , documents. len( ) ) ;
312+ eprintln ! ( "Finished document loading. Found {} final documents." , documents. len( ) ) ;
239313 Ok ( documents)
240314}
0 commit comments