tp_servicedesk/codebase-to-text.py
2025-04-30 02:15:45 +01:00


import os
import argparse
import subprocess
import git  # Used for git.Repo.clone_from when the input is a remote repository URL
import shutil
import tempfile
from pathlib import Path
from docx import Document
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern # Explicit import often good practice


class CodebaseToText:
    def __init__(self, input_path, output_path, output_type, verbose, exclude_hidden, ignored_paths=None):
        # Normalize local input paths early; leave repository URLs untouched, since
        # os.path.abspath would mangle a URL such as "https://github.com/user/repo.git"
        self.input_path = input_path
        if not self.is_github_repo():
            self.input_path = os.path.abspath(input_path)  # Use absolute path for consistency
        self.output_path = output_path
        self.output_type = output_type
        self.verbose = verbose
        self.exclude_hidden = exclude_hidden
        self.ignored_paths = ignored_paths if ignored_paths else []  # Store custom ignores
        self.temp_folder_path = None  # Used only if cloning
        self.is_cloned_repo = False  # Flag to track if we cloned
        self.git_ignore_spec = None
        self.custom_ignore_spec = None
        self._initialize_ignores()  # Load ignores after setting input_path
    def _initialize_ignores(self):
        """Loads .gitignore and initializes custom ignore spec."""
        # Load .gitignore relative to the current input_path
        gitignore_path = os.path.join(self.input_path, ".gitignore")
        if os.path.exists(gitignore_path):
            try:
                with open(gitignore_path, 'r', encoding='utf-8') as f:  # Specify encoding
                    lines = f.read().splitlines()
                # Filter out empty lines and comments
                lines = [line for line in lines if line.strip() and not line.strip().startswith('#')]
                if lines:
                    self.git_ignore_spec = PathSpec.from_lines(GitWildMatchPattern, lines)
                    if self.verbose:
                        print(f"Loaded .gitignore rules from: {gitignore_path}")
            except Exception as e:
                print(f"Warning: Could not read .gitignore file at {gitignore_path}: {e}")
        elif self.verbose:
            print(f"No .gitignore file found at: {gitignore_path}")

        # Create PathSpec for custom ignored paths
        if self.ignored_paths:
            # Filter out empty lines/patterns just in case
            valid_custom_paths = [p for p in self.ignored_paths if p.strip()]
            if valid_custom_paths:
                self.custom_ignore_spec = PathSpec.from_lines(GitWildMatchPattern, valid_custom_paths)
                if self.verbose:
                    print(f"Using custom ignore rules: {valid_custom_paths}")
            else:
                self.ignored_paths = []  # Clear if only contained empty strings
    def _is_path_ignored(self, file_or_dir_path):
        """Checks if a given path should be ignored based on all rules."""
        try:
            # Calculate relative path from the project root (self.input_path)
            # Use pathlib for robustness
            base_path = Path(self.input_path)
            target_path = Path(file_or_dir_path)
            # Use absolute paths temporarily to ensure correct relative calculation
            rel_path = target_path.relative_to(base_path).as_posix()  # Use POSIX paths for pathspec
        except ValueError:
            # If the path is not relative to input_path (shouldn't normally happen with os.walk)
            if self.verbose:
                print(f"Warning: Path {file_or_dir_path} is not relative to {self.input_path}. Skipping ignore checks for it.")
            return False  # Or decide how to handle this case

        # Check .gitignore rules
        if self.git_ignore_spec and self.git_ignore_spec.match_file(rel_path):
            if self.verbose > 1:  # More detailed verbose logging if needed
                print(f"Ignoring '{rel_path}' (gitignore)")
            return True

        # Check custom ignore rules
        if self.custom_ignore_spec and self.custom_ignore_spec.match_file(rel_path):
            if self.verbose > 1:
                print(f"Ignoring '{rel_path}' (custom)")
            return True

        # Check if hidden files/dirs should be excluded
        # Note: PathSpec patterns can also match hidden files (e.g., '.*'),
        # so this check is primarily for the simple dot/underscore prefix rule.
        if self.exclude_hidden and self._is_hidden_path_component(target_path):
            if self.verbose > 1:
                print(f"Ignoring '{rel_path}' (hidden)")
            return True

        return False
    def _is_hidden_path_component(self, path_obj: Path):
        """Checks if any component of the path starts with '.' or '__'."""
        # Check the name itself and its parents relative to the base input path
        relative_parts = path_obj.relative_to(self.input_path).parts
        return any(part.startswith(('.', '__')) for part in relative_parts if part != '.')
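    # Illustrative examples (hypothetical paths, assuming input_path is "/project"):
    #   _is_hidden_path_component(Path("/project/.venv/lib"))        -> True   ('.venv')
    #   _is_hidden_path_component(Path("/project/src/__pycache__"))  -> True   ('__pycache__')
    #   _is_hidden_path_component(Path("/project/src/app.py"))       -> False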
    def _parse_folder(self, folder_path):
        """Generates the directory tree string, respecting ignore rules."""
        tree = ""
        base_level = folder_path.count(os.sep)
        # Ensure folder_path is absolute for consistent relative path calculations
        abs_folder_path = Path(folder_path).resolve()

        for root, dirs, files in os.walk(abs_folder_path, topdown=True):
            abs_root_path = Path(root).resolve()

            # --- Directory Ignore Logic ---
            # Filter directories *before* recursing into them
            # Keep track of original dirs list to modify dirs[:]
            original_dirs = list(dirs)
            dirs[:] = []  # Clear dirs list, we will re-add ones we want to keep
            for d in original_dirs:
                dir_path = abs_root_path / d
                # Skip .git directory explicitly (essential)
                if d == ".git":
                    if self.verbose > 1:
                        print(f"Skipping .git directory: {dir_path}")
                    continue
                if self._is_path_ignored(str(dir_path)):
                    if self.verbose:
                        print(f"Ignoring directory: {dir_path.relative_to(self.input_path)}")
                    # Don't add 'd' back to dirs[:], effectively pruning the walk
                else:
                    dirs.append(d)  # Keep this directory for recursion

            # --- Calculate Tree Indentation ---
            try:
                # Calculate level relative to the *initial* input path for correct indentation
                rel_root = abs_root_path.relative_to(self.input_path)
                level = len(rel_root.parts) if rel_root.parts != ('.',) else 0
            except ValueError:
                # Should not happen if os.walk starts within input_path
                print(f"Warning: Cannot determine relative path for {abs_root_path}. Using level 0.")
                level = 0
            indent = '    ' * level  # 4 spaces per level

            # Add directory entry to tree (only if it's not the root itself processed initially)
            if abs_root_path != Path(self.input_path).resolve():  # Don't print root '/'
                tree += f"{indent}{abs_root_path.name}/\n"
            elif level == 0 and not tree:  # Print root marker only once at the start
                tree += f"{Path(self.input_path).name}/\n"

            # --- File Listing ---
            subindent = '    ' * (level + 1)
            sorted_files = sorted(files)  # Sort files for consistent output
            for f in sorted_files:
                file_path = abs_root_path / f
                # Check if file is ignored
                if not self._is_path_ignored(str(file_path)):
                    tree += f"{subindent}{f}\n"
                elif self.verbose:
                    # Note: _is_path_ignored already prints detailed reasons if verbose > 1
                    print(f"Ignoring file (in tree): {file_path.relative_to(self.input_path)}")

        if self.verbose:
            print(f"\n--- Generated File Tree ---\n{tree}")
            print("--- End File Tree ---\n")
        return tree
    def _get_file_contents(self, file_path):
        """Reads file content, handling potential encoding issues."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            try:
                # Try a fallback encoding (e.g., latin-1 or detected encoding)
                with open(file_path, 'r', encoding='latin-1') as file:
                    if self.verbose:
                        print(f"Warning: Used fallback encoding 'latin-1' for {file_path}")
                    return file.read()
            except Exception as e:
                print(f"Error: Could not read file {file_path} with utf-8 or latin-1: {e}")
                return f"Error reading file: {e}"  # Include error message in output
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            return f"Error reading file: {e}"
    def _process_files(self, path_to_walk):
        """Walks through files and concatenates their content, respecting ignores."""
        content = ""
        base_path = Path(self.input_path).resolve()  # Use resolved base path

        for root, dirs, files in os.walk(path_to_walk, topdown=True):
            abs_root_path = Path(root).resolve()

            # --- Directory Pruning (same logic as in _parse_folder) ---
            original_dirs = list(dirs)
            dirs[:] = []
            for d in original_dirs:
                dir_path = abs_root_path / d
                if d == ".git" or self._is_path_ignored(str(dir_path)):
                    continue  # Skip ignored or .git dirs
                else:
                    dirs.append(d)

            # --- Process Files in Current Directory ---
            sorted_files = sorted(files)
            for file_name in sorted_files:
                file_path = abs_root_path / file_name
                str_file_path = str(file_path)

                # Skip ignored files
                if self._is_path_ignored(str_file_path):
                    if self.verbose:
                        print(f"Ignoring file (content): {file_path.relative_to(base_path)}")
                    continue

                # Try to get content
                try:
                    if self.verbose:
                        print(f"Processing: {file_path.relative_to(base_path)}")
                    file_content = self._get_file_contents(str_file_path)
                    rel_file_path_display = file_path.relative_to(base_path).as_posix()  # Display relative path
                    content += f"\n\n--- File: {rel_file_path_display} ---\n"
                    # Optional: Add file type hint
                    # content += f"File type: {os.path.splitext(file_name)[1]}\n\n"
                    content += file_content
                    # Use a clear end marker
                    content += f"\n--- End File: {rel_file_path_display} ---\n"
                except Exception as e:  # Catch potential errors during processing
                    print(f"Couldn't process {file_path.relative_to(base_path)}: {e}")
                    content += f"\n\n--- Error processing file: {file_path.relative_to(base_path)} --- \n {e} \n--- End Error ---\n"

        return content
    def get_text(self):
        """Generates the final combined text output."""
        # --- Decide whether to clone or use local path ---
        process_path = self.input_path  # Default to local path
        if self.is_github_repo():
            success = self._clone_github_repo()
            if success:
                process_path = self.temp_folder_path
                self.is_cloned_repo = True
                # Re-initialize ignores for the cloned repo location
                self.input_path = process_path  # Temporarily change base for ignore checks
                self._initialize_ignores()
                print(f"Processing cloned repo at: {process_path}")
            else:
                print("Error: Failed to clone GitHub repository. Aborting.")
                # Reset input_path if cloning failed and we modified it
                if self.is_cloned_repo:
                    self.input_path = os.path.dirname(self.temp_folder_path)  # Hacky way to get original path back conceptually
                return "Error: Could not clone repository."  # Return error message
        else:
            print(f"Processing local path: {process_path}")
            # Ensure ignores are initialized for the local path (done in __init__)

        # --- Generate Structure and Content ---
        folder_structure = self._parse_folder(process_path)
        file_contents = self._process_files(process_path)

        # --- Assemble Final Output ---
        folder_structure_header = "--- Folder Structure ---"
        file_contents_header = "--- File Contents ---"
        delimiter = "=" * 60  # Use a more prominent delimiter

        # Restore original input_path if it was changed for cloning
        if self.is_cloned_repo:
            # This assumes the original input_path wasn't needed after _initialize_ignores
            # A cleaner way might be to pass the base path explicitly to ignore checkers
            pass  # No need to restore if input_path wasn't critical after cloning

        return (
            f"{folder_structure_header}\n{folder_structure}\n{delimiter}\n\n"
            f"{file_contents_header}\n{file_contents}\n{delimiter}\nEnd of Codebase\n{delimiter}"
        )
    def get_file(self):
        """Gets the text and saves it to the specified output file."""
        text_content = self.get_text()

        # Check for error during get_text (e.g., cloning failure)
        if text_content.startswith("Error:"):
            print(text_content)  # Print the error
            # Optionally, clean up temp folder even on error
            self.clean_up_temp_folder()
            return  # Exit without writing file

        try:
            # Ensure output directory exists
            output_dir = os.path.dirname(self.output_path)
            if output_dir:  # Handle case where output is in current dir
                os.makedirs(output_dir, exist_ok=True)

            if self.output_type == "txt":
                with open(self.output_path, "w", encoding='utf-8') as file:
                    file.write(text_content)
            elif self.output_type == "docx":
                doc = Document()
                # Add text respecting paragraphs (simple split, might need refinement)
                # Consider adding as preformatted text run if python-docx supports it well
                for paragraph in text_content.split('\n'):
                    doc.add_paragraph(paragraph)
                doc.save(self.output_path)
            else:
                # Should be caught by argparse choices usually, but good to have
                raise ValueError(f"Invalid output type '{self.output_type}'. Supported types: txt, docx")
            print(f"\nSuccessfully generated {self.output_type} file: {self.output_path}")
        except Exception as e:
            print(f"\nError writing output file {self.output_path}: {e}")
        finally:
            # Clean up temp folder regardless of writing success/failure
            self.clean_up_temp_folder()
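    # Sketch of the "preformatted text" idea noted in get_file(): write each line of the
    # report as a monospace run instead of a plain paragraph. Not wired into get_file();
    # the 'Courier New' font and 9pt size are arbitrary choices, not project requirements.
    def _add_preformatted_text(self, doc, text_content):
        from docx.shared import Pt  # local import; only this sketch needs it
        for line in text_content.split('\n'):
            run = doc.add_paragraph().add_run(line)
            run.font.name = 'Courier New'
            run.font.size = Pt(9)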
    #### GitHub ####
    def _clone_github_repo(self):
        """Clones the repo to a temporary directory."""
        try:
            # Create temp dir *before* cloning into it
            # Use a more descriptive prefix/suffix if desired
            self.temp_folder_path = tempfile.mkdtemp(prefix="cbt_repo_")
            print(f"Cloning {self.input_path} into temporary folder {self.temp_folder_path}...")
            git.Repo.clone_from(self.input_path, self.temp_folder_path)
            # Important: Update self.input_path to the temp folder *for processing*
            # self.input_path = self.temp_folder_path  # Now done within get_text
            if self.verbose:
                print("GitHub repository cloned successfully.")
            return True  # Indicate success
        except git.GitCommandError as e:
            print(f"Error cloning GitHub repository: {e}")
            # Clean up failed clone attempt
            self.clean_up_temp_folder()  # Ensure cleanup even on clone failure
            self.temp_folder_path = None  # Reset path
            return False  # Indicate failure
        except Exception as e:
            print(f"An unexpected error occurred during cloning: {e}")
            self.clean_up_temp_folder()  # Ensure cleanup
            self.temp_folder_path = None  # Reset path
            return False  # Indicate failure
    def is_github_repo(self):
        """Checks if the input path looks like a common Git repo URL."""
        # Keep it simple, add more patterns if needed
        return self.input_path.startswith((
            "https://github.com/", "git@github.com:",
            "https://gitlab.com/", "git@gitlab.com:",
            "https://bitbucket.org/", "git@bitbucket.org:",
        )) or self.input_path.endswith(".git")  # Common convention for clone URLs
    def clean_up_temp_folder(self):
        """Removes the temporary folder if it was created."""
        if self.temp_folder_path and os.path.exists(self.temp_folder_path):
            try:
                shutil.rmtree(self.temp_folder_path)
                if self.verbose:
                    print(f"Cleaned up temporary folder: {self.temp_folder_path}")
                self.temp_folder_path = None  # Reset path after successful removal
            except Exception as e:
                print(f"Warning: Could not remove temporary folder {self.temp_folder_path}: {e}")


# --- Main Execution ---
def main():
    parser = argparse.ArgumentParser(
        description="Generate a single text or docx file from a codebase, respecting .gitignore and custom ignore rules.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter  # Show defaults
    )
    parser.add_argument(
        "input",
        help="Input path (local folder path or Git repository URL)."
    )
    parser.add_argument(
        "-o", "--output",
        required=True,
        help="Output file path (e.g., my_codebase.txt or output/report.docx)."
    )
    parser.add_argument(
        "-t", "--output_type",
        choices=["txt", "docx"],
        default="txt",
        help="Output file type."
    )
    parser.add_argument(
        "--ignore",
        nargs='*', default=[],  # Accept zero or more ignore patterns
        help="List of additional paths/patterns to ignore (e.g., 'dist/', '/node_modules', '*.log'). Matched relative to the input path root."
    )
    parser.add_argument(
        "--exclude_hidden",
        action="store_true",
        help="Exclude files and folders starting with '.' or '__'. Note that .gitignore or custom ignores might already cover these."
    )
    parser.add_argument(
        "-v", "--verbose",
        action="count", default=0,  # Use count for verbosity levels (0, 1, 2)
        help="Increase output verbosity. -v for basic info, -vv for detailed ignore reasons."
    )
    args = parser.parse_args()

    # Basic validation
    if not args.input:
        parser.error("Input path cannot be empty.")
    if not args.output:
        parser.error("Output path cannot be empty.")

    code_to_text = None  # Ensure it's defined for the cleanup in the except block
    try:
        code_to_text = CodebaseToText(
            input_path=args.input,
            output_path=args.output,
            output_type=args.output_type,
            verbose=args.verbose,
            exclude_hidden=args.exclude_hidden,
            ignored_paths=args.ignore  # Pass the list here
        )
        code_to_text.get_file()
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        # Attempt cleanup even if initialization failed partially
        if code_to_text:
            code_to_text.clean_up_temp_folder()
    # No finally block needed here as get_file() now handles cleanup


def _sql_dump(database, export_file):
    command = [
        r'd:\xampp\mysql\bin\mysqldump.exe',
        '-u', 'root',
        '--no-data',
        database
    ]
    with open(export_file, 'w') as output_file:
        result = subprocess.run(command, stdout=output_file, stderr=subprocess.PIPE)
    if result.returncode == 0:
        print(f"Schema dump successful: {export_file}")
    else:
        print("Error occurred:", result.stderr.decode())


if __name__ == "__main__":
    # --- Example Usage (replace with main() for CLI) ---
    # To run from the command line, save the script (e.g., codebase_to_text.py) and run:
    #   python codebase_to_text.py . -o my_project.txt --ignore "dist/" "*.tmp" "/tests/data/" --exclude_hidden -v
    #   python codebase_to_text.py https://github.com/user/repo.git -o repo_code.docx -t docx -vv

    # --- Direct call example (useful for testing) ---
    try:
        print("Running direct example...")
        # Example: Process the current directory, output to _codebase_output.txt,
        # ignoring the 'venv' folder and all '.log' files
        example_ignores = ["venv/", "*.log", "/output.txt", ".git/", "__pycache__/"]  # Add common ignores
        example_ignores.append("public/css/")
        example_ignores.append("codebase-to-text.py")
        converter = CodebaseToText(
            input_path=".",
            output_path="_codebase_output.txt",
            output_type="txt",
            verbose=0,  # Set verbosity level (0, 1, or 2)
            exclude_hidden=True,
            ignored_paths=example_ignores
        )
        converter.get_file()
        print("Direct example finished.")
        _sql_dump('tp_servicedesk', '_codebase_schemafile.sql')
    except Exception as e:
        print(f"Error running direct example: {e}")

    # Uncomment the line below to enable command-line argument parsing when running the script directly
    # main()

    # To get a SQL dump, use something similar to:
    #   d:\xampp\mysql\bin\mysqldump.exe -u root --no-data tp_servicedesk > _codebase_schemafile.sql