"""Flatten a codebase (local folder or Git repository) into one txt/docx file.

The output contains a directory tree followed by the content of every file
that is not excluded by ``.gitignore``, custom ignore patterns or the
optional hidden-file rule.
"""

import argparse
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import git  # GitPython; used to clone remote repositories
from docx import Document
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern  # Explicit import often good practice


class CodebaseToText:
    """Collect the folder structure and file contents of a codebase.

    Args:
        input_path: Local folder path or Git repository URL.
        output_path: Path of the file written by :meth:`get_file`.
        output_type: ``"txt"`` or ``"docx"``.
        verbose: Verbosity level (0 = quiet, 1 = basic info, 2 = detailed
            ignore reasons).
        exclude_hidden: When true, skip every path that has a component
            starting with ``.`` or ``__``.
        ignored_paths: Optional list of extra gitignore-style patterns.
    """

    # URL prefixes that identify a remote repository on a known host.
    REMOTE_REPO_PREFIXES = (
        "https://github.com/",
        "git@github.com:",
        "https://gitlab.com/",
        "git@gitlab.com:",
        "https://bitbucket.org/",
        "git@bitbucket.org:",
    )

    def __init__(self, input_path, output_path, output_type, verbose,
                 exclude_hidden, ignored_paths=None):
        # A remote URL must be kept verbatim: os.path.abspath() would turn
        # "https://host/repo.git" into "<cwd>/https:/host/repo.git", which
        # then neither matches the URL prefixes nor can be cloned.
        if self._is_remote_url(input_path):
            self.input_path = input_path
        else:
            self.input_path = os.path.abspath(input_path)
        self.output_path = output_path
        self.output_type = output_type
        self.verbose = verbose
        self.exclude_hidden = exclude_hidden
        self.ignored_paths = ignored_paths if ignored_paths else []
        self.temp_folder_path = None  # Set only while a clone exists on disk
        self.is_cloned_repo = False  # True once a clone succeeded
        self.git_ignore_spec = None
        self.custom_ignore_spec = None
        self._initialize_ignores()  # Needs input_path, so it runs last

    @staticmethod
    def _is_remote_url(path):
        """Return True if *path* is a URL / scp-style remote, not a local path."""
        return (
            path.startswith(CodebaseToText.REMOTE_REPO_PREFIXES)
            or "://" in path
            or path.startswith("git@")
        )

    def _initialize_ignores(self):
        """Load ``.gitignore`` and build the custom ignore spec.

        Called from ``__init__`` and again from :meth:`get_text` after a
        clone, because ``self.input_path`` then points somewhere else.
        """
        # Reset first so a second call never keeps rules of the old root.
        self.git_ignore_spec = None
        self.custom_ignore_spec = None

        if self.is_github_repo():
            # Nothing is on disk yet; get_text() re-runs this after cloning.
            self._root_path = Path(self.input_path)
        else:
            # os.walk results are compared against this resolved root, so
            # symlinked inputs (e.g. macOS temp dirs) still match.
            self._root_path = Path(self.input_path).resolve()
            self._load_gitignore()

        patterns = [p for p in self.ignored_paths if p.strip()]
        if patterns:
            self.custom_ignore_spec = PathSpec.from_lines(GitWildMatchPattern, patterns)
            if self.verbose:
                print(f"Using custom ignore rules: {patterns}")
        else:
            self.ignored_paths = []  # Clear if only contained empty strings

    def _load_gitignore(self):
        """Read ``<input_path>/.gitignore`` into ``self.git_ignore_spec``."""
        gitignore_path = os.path.join(self.input_path, ".gitignore")
        if not os.path.exists(gitignore_path):
            if self.verbose:
                print(f"No .gitignore file found at: {gitignore_path}")
            return
        try:
            with open(gitignore_path, 'r', encoding='utf-8') as f:
                lines = f.read().splitlines()
            # Drop blank lines and comments before compiling the patterns.
            lines = [
                line for line in lines
                if line.strip() and not line.strip().startswith('#')
            ]
            if lines:
                self.git_ignore_spec = PathSpec.from_lines(GitWildMatchPattern, lines)
                if self.verbose:
                    print(f"Loaded .gitignore rules from: {gitignore_path}")
        except (OSError, ValueError) as e:
            # ValueError covers decode errors and invalid patterns.
            print(f"Warning: Could not read .gitignore file at {gitignore_path}: {e}")

    def _relative_to_root(self, path):
        """Return *path* relative to the project root, or None if outside it.

        Both the resolved and the unresolved root are tried so callers may
        pass either form.
        """
        target = Path(path)
        for base in (self._root_path, Path(self.input_path)):
            try:
                return target.relative_to(base)
            except ValueError:
                continue
        return None

    def _is_path_ignored(self, file_or_dir_path, is_dir=None):
        """Check whether a path is excluded by any ignore rule.

        Args:
            file_or_dir_path: Absolute path below the project root.
            is_dir: Whether the path is a directory. ``None`` (default) asks
                the filesystem.

        Returns:
            True if the path matches ``.gitignore``, a custom pattern, or
            (with ``exclude_hidden``) has a hidden component.
        """
        target_path = Path(file_or_dir_path)
        rel = self._relative_to_root(target_path)
        if rel is None:
            # Shouldn't normally happen with os.walk below the root.
            if self.verbose:
                print(f"Warning: Path {file_or_dir_path} is not relative to {self.input_path}. Skipping ignore checks for it.")
            return False

        rel_path = rel.as_posix()  # pathspec expects POSIX separators
        if rel_path == ".":
            return False  # The root itself is never ignored

        if is_dir is None:
            is_dir = target_path.is_dir()
        # Directory-only patterns such as "venv/" only match a path that
        # ends with a slash, so mark directories explicitly.
        match_path = f"{rel_path}/" if is_dir else rel_path

        if self.git_ignore_spec and self.git_ignore_spec.match_file(match_path):
            if self.verbose > 1:
                print(f"Ignoring '{rel_path}' (gitignore)")
            return True

        if self.custom_ignore_spec and self.custom_ignore_spec.match_file(match_path):
            if self.verbose > 1:
                print(f"Ignoring '{rel_path}' (custom)")
            return True

        # PathSpec patterns can also match hidden files (e.g. '.*'); this is
        # the simple dot/double-underscore prefix rule.
        if self.exclude_hidden and self._is_hidden_path_component(target_path):
            if self.verbose > 1:
                print(f"Ignoring '{rel_path}' (hidden)")
            return True

        return False

    def _is_hidden_path_component(self, path_obj: Path):
        """Return True if any component below the root starts with '.' or '__'.

        NOTE: the '__' rule also matches files such as ``__init__.py``.
        """
        rel = self._relative_to_root(path_obj)
        if rel is None:
            return False
        return any(part.startswith(('.', '__')) for part in rel.parts if part != '.')

    def _prune_dirs(self, parent, dirs, log_ignored=False):
        """Filter os.walk's *dirs* list in place so ignored dirs are not visited.

        Args:
            parent: Path of the directory currently being walked.
            dirs: The mutable ``dirs`` list yielded by ``os.walk``.
            log_ignored: Print skipped directories according to verbosity.
        """
        kept = []
        # Sorted so the tree and the content section have a stable order.
        for name in sorted(dirs):
            dir_path = parent / name
            if name == ".git":  # Never descend into repository metadata
                if log_ignored and self.verbose > 1:
                    print(f"Skipping .git directory: {dir_path}")
                continue
            if self._is_path_ignored(str(dir_path), is_dir=True):
                if log_ignored and self.verbose:
                    rel = self._relative_to_root(dir_path)
                    print(f"Ignoring directory: {rel if rel is not None else dir_path}")
                continue
            kept.append(name)
        dirs[:] = kept

    def _parse_folder(self, folder_path):
        """Build the directory tree string, respecting ignore rules.

        Args:
            folder_path: Directory to walk (normally ``self.input_path``).

        Returns:
            One line per kept directory/file, indented 4 spaces per level.
        """
        tree_lines = []
        start_path = Path(folder_path).resolve()

        for root, dirs, files in os.walk(start_path, topdown=True):
            current = Path(root)
            self._prune_dirs(current, dirs, log_ignored=True)

            rel_root = self._relative_to_root(current)
            if rel_root is None:
                # Should not happen if os.walk starts within input_path.
                print(f"Warning: Cannot determine relative path for {current}. Using level 0.")
                level = 0
            else:
                level = len(rel_root.parts)

            if current != self._root_path:
                tree_lines.append(f"{'    ' * level}{current.name}/")
            elif not tree_lines:
                # Root marker, printed once at the very top.
                tree_lines.append(f"{Path(self.input_path).name}/")

            subindent = '    ' * (level + 1)
            for f in sorted(files):
                file_path = current / f
                if not self._is_path_ignored(str(file_path), is_dir=False):
                    tree_lines.append(f"{subindent}{f}")
                elif self.verbose:
                    rel = self._relative_to_root(file_path)
                    print(f"Ignoring file (in tree): {rel if rel is not None else file_path}")

        tree = "".join(f"{line}\n" for line in tree_lines)
        if self.verbose:
            print(f"\n--- Generated File Tree ---\n{tree}")
            print("--- End File Tree ---\n")
        return tree

    def _get_file_contents(self, file_path):
        """Read a file as text: UTF-8 first, latin-1 as fallback.

        Returns:
            The file content, or an ``"Error reading file: ..."`` string so
            the failure is visible in the generated output.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    if self.verbose:
                        print(f"Warning: Used fallback encoding 'latin-1' for {file_path}")
                    return file.read()
            except OSError as e:
                print(f"Error: Could not read file {file_path} with utf-8 or latin-1: {e}")
                return f"Error reading file: {e}"
        except OSError as e:
            print(f"Error reading file {file_path}: {e}")
            return f"Error reading file: {e}"

    def _process_files(self, path_to_walk):
        """Concatenate the content of every non-ignored file.

        Args:
            path_to_walk: Directory to walk (normally ``self.input_path``).

        Returns:
            All file contents, each wrapped in ``--- File: ... ---`` and
            ``--- End File: ... ---`` markers.
        """
        chunks = []  # Joined once at the end instead of repeated +=

        for root, dirs, files in os.walk(Path(path_to_walk).resolve(), topdown=True):
            current = Path(root)
            self._prune_dirs(current, dirs)

            for file_name in sorted(files):
                file_path = current / file_name
                str_file_path = str(file_path)
                rel = self._relative_to_root(file_path)
                shown = rel if rel is not None else file_path

                if self._is_path_ignored(str_file_path, is_dir=False):
                    if self.verbose:
                        print(f"Ignoring file (content): {shown}")
                    continue

                try:
                    if self.verbose:
                        print(f"Processing: {shown}")
                    file_content = self._get_file_contents(str_file_path)
                    display = shown.as_posix()
                    chunks.append(f"\n\n--- File: {display} ---\n")
                    chunks.append(file_content)
                    chunks.append(f"\n--- End File: {display} ---\n")
                except Exception as e:
                    # Best effort: one bad file must not abort the whole run.
                    print(f"Couldn't process {shown}: {e}")
                    chunks.append(
                        f"\n\n--- Error processing file: {shown} --- \n {e} \n--- End Error ---\n"
                    )

        return "".join(chunks)

    def get_text(self):
        """Generate the combined output text.

        A remote repository is cloned into a temporary folder first; the
        caller is responsible for :meth:`clean_up_temp_folder` afterwards
        (:meth:`get_file` does this automatically).

        Returns:
            The folder structure and file contents, or a string starting
            with ``"Error:"`` if cloning failed.
        """
        process_path = self.input_path
        if self.is_github_repo():
            if not self._clone_github_repo():
                print("Error: Failed to clone GitHub repository. Aborting.")
                return "Error: Could not clone repository."
            process_path = self.temp_folder_path
            self.is_cloned_repo = True
            # The clone is the project root from now on; reload its ignores.
            self.input_path = process_path
            self._initialize_ignores()
            print(f"Processing cloned repo at: {process_path}")
        else:
            print(f"Processing local path: {process_path}")

        folder_structure = self._parse_folder(process_path)
        file_contents = self._process_files(process_path)

        folder_structure_header = "--- Folder Structure ---"
        file_contents_header = "--- File Contents ---"
        delimiter = "=" * 60

        return (
            f"{folder_structure_header}\n{folder_structure}\n{delimiter}\n\n"
            f"{file_contents_header}\n{file_contents}\n{delimiter}\nEnd of Codebase\n{delimiter}"
        )

    def get_file(self):
        """Generate the text and write it to ``self.output_path``.

        Write errors are reported on stdout rather than raised. The
        temporary clone folder, if any, is always removed.
        """
        text_content = self.get_text()

        # get_text() signals a clone failure through an "Error:" prefix.
        if text_content.startswith("Error:"):
            print(text_content)
            self.clean_up_temp_folder()
            return

        try:
            output_dir = os.path.dirname(self.output_path)
            if output_dir:  # Empty when the output is in the current dir
                os.makedirs(output_dir, exist_ok=True)

            if self.output_type == "txt":
                with open(self.output_path, "w", encoding='utf-8') as file:
                    file.write(text_content)
            elif self.output_type == "docx":
                doc = Document()
                # One paragraph per line (simple split, might need refinement).
                for paragraph in text_content.split('\n'):
                    doc.add_paragraph(paragraph)
                doc.save(self.output_path)
            else:
                # Normally prevented by the argparse choices.
                raise ValueError(f"Invalid output type '{self.output_type}'. Supported types: txt, docx")

            print(f"\nSuccessfully generated {self.output_type} file: {self.output_path}")
        except Exception as e:
            print(f"\nError writing output file {self.output_path}: {e}")
        finally:
            self.clean_up_temp_folder()

    #### GitHub ####

    def _clone_github_repo(self):
        """Clone ``self.input_path`` into a fresh temporary directory.

        Returns:
            True on success (``self.temp_folder_path`` holds the clone),
            False on failure (the temp folder is removed again).
        """
        try:
            self.temp_folder_path = tempfile.mkdtemp(prefix="cbt_repo_")
            print(f"Cloning {self.input_path} into temporary folder {self.temp_folder_path}...")
            git.Repo.clone_from(self.input_path, self.temp_folder_path)
            if self.verbose:
                print("GitHub repository cloned successfully.")
            return True
        except git.GitCommandError as e:
            print(f"Error cloning GitHub repository: {e}")
        except Exception as e:
            print(f"An unexpected error occurred during cloning: {e}")
        # Only reached after a failure: drop the partial clone.
        self.clean_up_temp_folder()
        self.temp_folder_path = None
        return False

    def is_github_repo(self):
        """Return True if the input looks like a Git repository URL.

        Matches the known host prefixes and, by convention, anything ending
        in ``.git``.
        """
        return (
            self.input_path.startswith(self.REMOTE_REPO_PREFIXES)
            or self.input_path.endswith(".git")
        )

    def clean_up_temp_folder(self):
        """Remove the temporary clone folder if one exists."""
        if self.temp_folder_path and os.path.exists(self.temp_folder_path):
            try:
                shutil.rmtree(self.temp_folder_path)
                if self.verbose:
                    print(f"Cleaned up temporary folder: {self.temp_folder_path}")
                self.temp_folder_path = None  # Reset only after successful removal
            except Exception as e:
                print(f"Warning: Could not remove temporary folder {self.temp_folder_path}: {e}")


# --- Main Execution ---
def main():
    """Parse command-line arguments and generate the output file."""
    parser = argparse.ArgumentParser(
        description="Generate a single text or docx file from a codebase, respecting .gitignore and custom ignore rules.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,  # Show defaults
    )
    parser.add_argument(
        "input",
        help="Input path (local folder path or Git repository URL).",
    )
    parser.add_argument(
        "-o", "--output",
        required=True,
        help="Output file path (e.g., my_codebase.txt or output/report.docx).",
    )
    parser.add_argument(
        "-t", "--output_type",
        choices=["txt", "docx"],
        default="txt",
        help="Output file type.",
    )
    parser.add_argument(
        "--ignore",
        nargs='*',
        default=[],  # Accept zero or more ignore patterns
        help="List of additional paths/patterns to ignore (e.g., 'dist/' '/node_modules' '*.log'). Matches relative to the input path root.",
    )
    parser.add_argument(
        "--exclude_hidden",
        action="store_true",
        help="Exclude files and folders starting with '.' or '__'. Note that .gitignore or custom ignores might already cover these.",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="count",
        default=0,  # Verbosity levels 0, 1, 2
        help="Increase output verbosity. -v for basic info, -vv for detailed ignore reasons.",
    )

    args = parser.parse_args()

    # argparse accepts empty strings, so reject them explicitly.
    if not args.input:
        parser.error("Input path cannot be empty.")
    if not args.output:
        parser.error("Output path cannot be empty.")

    code_to_text = None  # Defined up front for the except branch
    try:
        code_to_text = CodebaseToText(
            input_path=args.input,
            output_path=args.output,
            output_type=args.output_type,
            verbose=args.verbose,
            exclude_hidden=args.exclude_hidden,
            ignored_paths=args.ignore,
        )
        code_to_text.get_file()
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        # get_file() cleans up itself; this covers failures before that.
        if code_to_text:
            code_to_text.clean_up_temp_folder()


def _sql_dump(database, export_file,
              mysqldump_path=r'd:\xampp\mysql\bin\mysqldump.exe', user='root'):
    """Dump the schema (no data) of a MySQL database to a file.

    Args:
        database: Name of the database to dump.
        export_file: Path of the SQL file to write.
        mysqldump_path: Location of the ``mysqldump`` executable.
        user: MySQL user passed via ``-u``.
    """
    command = [mysqldump_path, '-u', user, '--no-data', database]

    with open(export_file, 'w') as output_file:
        result = subprocess.run(command, stdout=output_file, stderr=subprocess.PIPE)

    if result.returncode == 0:
        print(f"Schema dump successful: {export_file}")
    else:
        # Console output is not guaranteed to be UTF-8; never fail on it.
        print("Error occurred:", result.stderr.decode(errors="replace"))


if __name__ == "__main__":
    # --- Example Usage (replace with main() for CLI) ---
    # To run from command line, save the script (e.g., codebase_to_text.py) and run:
    # python codebase_to_text.py . -o my_project.txt --ignore "dist/" "*.tmp" "/tests/data/" --exclude_hidden -v
    # python codebase_to_text.py https://github.com/user/repo.git -o repo_code.docx -t docx -vv

    # --- Direct call example (useful for testing) ---
    try:
        print("Running direct example...")
        # Process the current directory, write _codebase_output.txt and
        # skip common noise such as 'venv' and '*.log' files.
        example_ignores = ["venv/", "*.log", "/output.txt", ".git/", "__pycache__/"]
        example_ignores.append("public/css/")
        example_ignores.append("codebase-to-text.py")

        converter = CodebaseToText(
            input_path=".",
            output_path="_codebase_output.txt",
            output_type="txt",
            verbose=0,  # Verbosity level (0, 1, or 2)
            exclude_hidden=True,
            ignored_paths=example_ignores,
        )
        converter.get_file()
        print("Direct example finished.")

        _sql_dump('tp_servicedesk', '_codebase_schemafile.sql')

    except Exception as e:
        print(f"Error running direct example: {e}")

    # Uncomment the line below to enable command-line argument parsing when running the script directly
    # main()

    # To get a SQL dump manually, use something similar to:
    # d:\xampp\mysql\bin\mysqldump.exe -u root --no-data tp_servicedesk > _codebase_schemafile.sql