488 lines
22 KiB
Python
488 lines
22 KiB
Python
import os
|
|
import argparse
|
|
import subprocess
|
|
import git # Still needed for potential future use or for checking if it's a repo, even if not cloning
|
|
import shutil
|
|
import tempfile
|
|
from pathlib import Path
|
|
from docx import Document
|
|
from pathspec import PathSpec
|
|
from pathspec.patterns import GitWildMatchPattern # Explicit import often good practice
|
|
|
|
class CodebaseToText:
    """Flatten a codebase into a single text or docx document.

    The input is either a local folder or a Git clone URL.  The generated
    document holds a directory tree followed by the content of every file
    that is not excluded by the project's ``.gitignore``, by the custom
    ignore patterns, or (optionally) by the hidden-name rule.
    """

    # URL prefixes that is_github_repo() recognises as remote repositories.
    _REMOTE_PREFIXES = (
        "https://github.com/", "git@github.com:",
        "https://gitlab.com/", "git@gitlab.com:",
        "https://bitbucket.org/", "git@bitbucket.org:",
    )

    # Indentation unit of the rendered tree: 4 spaces per level.
    _TREE_INDENT = "    "

    # str.translate() table removing characters that XML 1.0 forbids; the
    # docx writer rejects any paragraph containing one of them.
    _XML_ILLEGAL = dict.fromkeys(
        [c for c in range(0x20) if c not in (0x09, 0x0A, 0x0D)] + [0xFFFE, 0xFFFF]
    )

    def __init__(self, input_path, output_path, output_type, verbose, exclude_hidden, ignored_paths=None):
        """Store the configuration and load the ignore rules.

        Args:
            input_path: Local folder path or Git repository URL.
            output_path: File the combined document is written to.
            output_type: ``"txt"`` or ``"docx"``.
            verbose: Verbosity level (0 quiet, 1 progress, 2 ignore reasons).
            exclude_hidden: Skip path components starting with ``.`` or ``__``.
            ignored_paths: Extra gitignore-style patterns to exclude.
        """
        self.output_path = output_path
        self.output_type = output_type
        self.verbose = verbose
        self.exclude_hidden = exclude_hidden
        self.ignored_paths = list(ignored_paths) if ignored_paths else []  # Custom ignores

        self.temp_folder_path = None  # Used only if cloning
        self.is_cloned_repo = False   # Flag to track if we cloned

        self.git_ignore_spec = None
        self.custom_ignore_spec = None

        # A URL must be kept verbatim: os.path.abspath() would collapse the
        # "//" after the scheme and glue the working directory in front,
        # turning the URL into a non-existent local path.
        self.input_path = input_path
        if not self.is_github_repo():
            # Resolve symlinks so that paths produced by the (resolved)
            # directory walks are always relative to this root.
            self.input_path = str(Path(input_path).resolve())

        self._initialize_ignores()  # Load ignores after setting input_path

    def _initialize_ignores(self):
        """Load ``.gitignore`` from the current root and compile custom patterns."""
        # Start from a clean slate so that re-initialising for a freshly
        # cloned repository never keeps rules of the previous root.
        self.git_ignore_spec = None
        self.custom_ignore_spec = None

        # A remote URL has no readable .gitignore until it has been cloned.
        if not self.is_github_repo():
            self._load_gitignore(os.path.join(self.input_path, ".gitignore"))

        if self.ignored_paths:
            # Filter out empty patterns just in case
            valid_custom_paths = [p for p in self.ignored_paths if p.strip()]
            if valid_custom_paths:
                self.custom_ignore_spec = PathSpec.from_lines(GitWildMatchPattern, valid_custom_paths)
                if self.verbose:
                    print(f"Using custom ignore rules: {valid_custom_paths}")
            else:
                self.ignored_paths = []  # Clear if only contained empty strings

    def _load_gitignore(self, gitignore_path):
        """Compile the rules of *gitignore_path* into ``self.git_ignore_spec``."""
        if not os.path.exists(gitignore_path):
            if self.verbose:
                print(f"No .gitignore file found at: {gitignore_path}")
            return

        try:
            with open(gitignore_path, 'r', encoding='utf-8') as f:
                # Drop blank lines and comments
                lines = [
                    line for line in f.read().splitlines()
                    if line.strip() and not line.strip().startswith('#')
                ]
            if lines:
                self.git_ignore_spec = PathSpec.from_lines(GitWildMatchPattern, lines)
                if self.verbose:
                    print(f"Loaded .gitignore rules from: {gitignore_path}")
        except (OSError, ValueError) as e:
            # ValueError covers undecodable bytes and invalid patterns.
            print(f"Warning: Could not read .gitignore file at {gitignore_path}: {e}")

    def _relative(self, path_obj):
        """Return *path_obj* relative to the root, or unchanged when outside it."""
        try:
            return path_obj.relative_to(self.input_path)
        except ValueError:
            return path_obj

    def _is_path_ignored(self, file_or_dir_path):
        """Return True when the path is excluded by any ignore rule.

        Rules are checked in this order: ``.gitignore``, custom patterns,
        hidden-name exclusion.  A path outside the root is never ignored.
        """
        target_path = Path(file_or_dir_path)
        try:
            rel = target_path.relative_to(self.input_path)
        except ValueError:
            # Not below input_path (shouldn't normally happen with os.walk)
            if self.verbose:
                print(f"Warning: Path {file_or_dir_path} is not relative to {self.input_path}. Skipping ignore checks for it.")
            return False

        rel_path = rel.as_posix()  # pathspec expects POSIX separators

        if self.git_ignore_spec or self.custom_ignore_spec:
            # Directory-only patterns such as "venv/" match a path only when
            # it carries a trailing slash, so mark directories explicitly;
            # otherwise ignored directories are walked and listed anyway.
            match_path = rel_path + "/" if rel.parts and target_path.is_dir() else rel_path

            if self.git_ignore_spec and self.git_ignore_spec.match_file(match_path):
                if self.verbose > 1:
                    print(f"Ignoring '{rel_path}' (gitignore)")
                return True

            if self.custom_ignore_spec and self.custom_ignore_spec.match_file(match_path):
                if self.verbose > 1:
                    print(f"Ignoring '{rel_path}' (custom)")
                return True

        # PathSpec patterns can also match hidden files (e.g. '.*'); this
        # check covers the simple dot/double-underscore prefix rule.
        if self.exclude_hidden and self._is_hidden_path_component(target_path):
            if self.verbose > 1:
                print(f"Ignoring '{rel_path}' (hidden)")
            return True

        return False

    def _is_hidden_path_component(self, path_obj: Path):
        """Return True if any component below the root starts with '.' or '__'.

        Note that this also covers names such as ``__init__.py``.

        Raises:
            ValueError: If *path_obj* is not located below ``self.input_path``.
        """
        relative_parts = path_obj.relative_to(self.input_path).parts
        return any(part.startswith(('.', '__')) for part in relative_parts if part != '.')

    def _prune_dirs(self, parent, dirs, announce=False):
        """Return the sorted sub-directory names of *parent* that should be walked.

        ``.git`` is always dropped.  With *announce* set, dropped directories
        are reported according to the verbosity level.
        """
        kept = []
        for d in sorted(dirs):  # Sorted for a deterministic walk order
            dir_path = parent / d
            if d == ".git":
                if announce and self.verbose > 1:
                    print(f"Skipping .git directory: {dir_path}")
                continue
            if self._is_path_ignored(str(dir_path)):
                if announce and self.verbose:
                    print(f"Ignoring directory: {self._relative(dir_path)}")
                continue
            kept.append(d)
        return kept

    def _parse_folder(self, folder_path):
        """Return the directory tree below *folder_path* as an indented string.

        Ignored directories are pruned before the walk descends into them and
        ignored files are left out.  Indentation is relative to
        ``self.input_path``.
        """
        lines = []
        root_path = Path(self.input_path)

        for root, dirs, files in os.walk(Path(folder_path).resolve(), topdown=True):
            current = Path(root)

            # Assigning to dirs[:] prunes the walk in place.
            dirs[:] = self._prune_dirs(current, dirs, announce=True)

            try:
                # Level relative to the *initial* input path
                level = len(current.relative_to(root_path).parts)
            except ValueError:
                # Should not happen if os.walk starts within input_path
                print(f"Warning: Cannot determine relative path for {current}. Using level 0.")
                level = 0

            if current != root_path:
                lines.append(f"{self._TREE_INDENT * level}{current.name}/\n")
            elif level == 0 and not lines:  # Root marker, only once at the start
                lines.append(f"{root_path.name}/\n")

            subindent = self._TREE_INDENT * (level + 1)
            for f in sorted(files):  # Sorted for consistent output
                file_path = current / f
                if not self._is_path_ignored(str(file_path)):
                    lines.append(f"{subindent}{f}\n")
                elif self.verbose:
                    # _is_path_ignored already prints the reason if verbose > 1
                    print(f"Ignoring file (in tree): {self._relative(file_path)}")

        tree = "".join(lines)

        if self.verbose:
            print(f"\n--- Generated File Tree ---\n{tree}")
            print("--- End File Tree ---\n")

        return tree

    def _get_file_contents(self, file_path):
        """Return the text of *file_path*, or an ``Error reading file`` message.

        UTF-8 is tried first and latin-1 is the fallback.  latin-1 accepts any
        byte sequence, so binary files come back as unreadable text rather
        than as an error.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    if self.verbose:
                        print(f"Warning: Used fallback encoding 'latin-1' for {file_path}")
                    return file.read()
            except Exception as e:
                print(f"Error: Could not read file {file_path} with utf-8 or latin-1: {e}")
                return f"Error reading file: {e}"  # Include error message in output
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            return f"Error reading file: {e}"

    def _process_files(self, path_to_walk):
        """Return the concatenated content of all non-ignored files.

        Every file is framed by ``--- File: <path> ---`` and
        ``--- End File: <path> ---`` markers carrying its POSIX-style path
        relative to the root.
        """
        chunks = []

        for root, dirs, files in os.walk(Path(path_to_walk).resolve(), topdown=True):
            current = Path(root)

            # Same pruning as in _parse_folder, without the log output.
            dirs[:] = self._prune_dirs(current, dirs)

            for file_name in sorted(files):
                file_path = current / file_name
                str_file_path = str(file_path)
                shown = self._relative(file_path)

                if self._is_path_ignored(str_file_path):
                    if self.verbose:
                        print(f"Ignoring file (content): {shown}")
                    continue

                if self.verbose:
                    print(f"Processing: {shown}")

                # _get_file_contents never raises; read errors end up in the
                # returned text.
                file_content = self._get_file_contents(str_file_path)
                rel_file_path_display = shown.as_posix()

                chunks.append(f"\n\n--- File: {rel_file_path_display} ---\n")
                chunks.append(file_content)
                chunks.append(f"\n--- End File: {rel_file_path_display} ---\n")

        return "".join(chunks)

    def get_text(self):
        """Return the combined document (tree plus file contents) as a string.

        A remote repository is cloned into a temporary folder first; that
        folder then becomes ``self.input_path`` and stays on disk until
        clean_up_temp_folder() is called.  When cloning fails, a string
        starting with ``"Error:"`` is returned instead of the document.
        """
        process_path = self.input_path  # Default to local path
        if self.is_github_repo():
            if not self._clone_github_repo():
                print("Error: Failed to clone GitHub repository. Aborting.")
                return "Error: Could not clone repository."

            # Resolve the clone location for the same reason as in __init__:
            # temporary folders may sit behind a symlink.
            process_path = str(Path(self.temp_folder_path).resolve())
            self.is_cloned_repo = True
            self.input_path = process_path  # New base for ignore checks
            self._initialize_ignores()      # Pick up the clone's .gitignore
            print(f"Processing cloned repo at: {process_path}")
        else:
            print(f"Processing local path: {process_path}")

        folder_structure = self._parse_folder(process_path)
        file_contents = self._process_files(process_path)

        folder_structure_header = "--- Folder Structure ---"
        file_contents_header = "--- File Contents ---"
        delimiter = "=" * 60

        return (
            f"{folder_structure_header}\n{folder_structure}\n{delimiter}\n\n"
            f"{file_contents_header}\n{file_contents}\n{delimiter}\nEnd of Codebase\n{delimiter}"
        )

    @classmethod
    def _xml_safe(cls, text):
        """Return *text* without the control characters XML 1.0 forbids."""
        return text.translate(cls._XML_ILLEGAL)

    def get_file(self):
        """Generate the document and write it to ``self.output_path``.

        Write errors are reported on stdout instead of being raised.  The
        temporary clone folder, if any, is removed in every case.
        """
        text_content = self.get_text()

        # get_text signals failure (e.g. cloning) through an "Error:" prefix
        if text_content.startswith("Error:"):
            print(text_content)
            self.clean_up_temp_folder()
            return  # Exit without writing file

        try:
            output_dir = os.path.dirname(self.output_path)
            if output_dir:  # Empty when the output goes to the current dir
                os.makedirs(output_dir, exist_ok=True)

            if self.output_type == "txt":
                with open(self.output_path, "w", encoding='utf-8') as file:
                    file.write(text_content)
            elif self.output_type == "docx":
                doc = Document()
                # One paragraph per line.  Control characters are removed
                # first: the docx writer refuses them, so a single form feed
                # or binary file would otherwise abort the whole export.
                for paragraph in self._xml_safe(text_content).split('\n'):
                    doc.add_paragraph(paragraph)
                doc.save(self.output_path)
            else:
                # Should be caught by argparse choices usually
                raise ValueError(f"Invalid output type '{self.output_type}'. Supported types: txt, docx")

            print(f"\nSuccessfully generated {self.output_type} file: {self.output_path}")

        except Exception as e:
            print(f"\nError writing output file {self.output_path}: {e}")

        finally:
            # Clean up temp folder regardless of writing success/failure
            self.clean_up_temp_folder()

    #### GitHub ####

    def _clone_github_repo(self):
        """Clone ``self.input_path`` into a new temporary folder.

        Returns:
            True on success; False on failure, in which case the temporary
            folder has been removed and ``self.temp_folder_path`` is None.
        """
        # Do not leak the folder of an earlier clone.
        self.clean_up_temp_folder()
        try:
            self.temp_folder_path = tempfile.mkdtemp(prefix="cbt_repo_")
            print(f"Cloning {self.input_path} into temporary folder {self.temp_folder_path}...")
            git.Repo.clone_from(self.input_path, self.temp_folder_path)

            if self.verbose:
                print("GitHub repository cloned successfully.")
            return True
        except git.GitCommandError as e:
            print(f"Error cloning GitHub repository: {e}")
        except Exception as e:
            print(f"An unexpected error occurred during cloning: {e}")

        # Only reached on failure: remove the partial clone.
        self.clean_up_temp_folder()
        self.temp_folder_path = None
        return False

    def is_github_repo(self):
        """Return True if ``self.input_path`` looks like a Git clone URL.

        Despite the name, GitLab and Bitbucket prefixes are recognised too,
        as is anything ending in ``.git``.
        """
        return self.input_path.startswith(self._REMOTE_PREFIXES) or self.input_path.endswith(".git")

    def clean_up_temp_folder(self):
        """Remove the temporary clone folder if one exists."""
        if not (self.temp_folder_path and os.path.exists(self.temp_folder_path)):
            return
        try:
            shutil.rmtree(self.temp_folder_path)
            if self.verbose:
                print(f"Cleaned up temporary folder: {self.temp_folder_path}")
            self.temp_folder_path = None  # Reset path after successful removal
        except Exception as e:
            print(f"Warning: Could not remove temporary folder {self.temp_folder_path}: {e}")
|
|
|
|
# --- Main Execution ---
|
|
def main(argv=None):
    """Command-line entry point: parse the arguments and write the output file.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 on invalid arguments and status 1 when the
            conversion raises unexpectedly.
    """
    parser = argparse.ArgumentParser(
        description="Generate a single text or docx file from a codebase, respecting .gitignore and custom ignore rules.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter  # Show defaults
    )
    parser.add_argument(
        "input",
        help="Input path (local folder path or Git repository URL)."
    )
    parser.add_argument(
        "-o", "--output",
        required=True,
        help="Output file path (e.g., my_codebase.txt or output/report.docx)."
    )
    parser.add_argument(
        "-t", "--output_type",
        choices=["txt", "docx"],
        default="txt",
        help="Output file type."
    )
    parser.add_argument(
        "--ignore",
        nargs='*', default=[],  # Accept zero or more ignore patterns
        help="List of additional paths/patterns to ignore (e.g., 'dist/' '/node_modules' '*.log'). Matches relative to the input path root."
    )
    parser.add_argument(
        "--exclude_hidden",
        action="store_true",
        help="Exclude files and folders starting with '.' or '__'. Note that .gitignore or custom ignores might already cover these."
    )
    parser.add_argument(
        "-v", "--verbose",
        action="count", default=0,  # Count gives verbosity levels (0, 1, 2)
        help="Increase output verbosity. -v for basic info, -vv for detailed ignore reasons."
    )
    args = parser.parse_args(argv)

    # Basic validation: argparse accepts empty strings such as "".
    if not args.input:
        parser.error("Input path cannot be empty.")
    if not args.output:
        parser.error("Output path cannot be empty.")

    code_to_text = None  # Stays None if the constructor itself fails
    try:
        code_to_text = CodebaseToText(
            input_path=args.input,
            output_path=args.output,
            output_type=args.output_type,
            verbose=args.verbose,
            exclude_hidden=args.exclude_hidden,
            ignored_paths=args.ignore
        )
        # get_file() removes the temporary clone folder itself.
        code_to_text.get_file()
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        # Attempt cleanup even if initialization failed partially
        if code_to_text:
            code_to_text.clean_up_temp_folder()
        # Report the failure to the calling shell instead of exiting with 0.
        parser.exit(1)
|
|
|
|
def _sql_dump(database, export_file,
              mysqldump_path=r'd:\xampp\mysql\bin\mysqldump.exe', user='root'):
    """Write a schema-only (``--no-data``) mysqldump of *database* to *export_file*.

    The dump is captured in memory and written only after mysqldump exits
    successfully, so a failed run neither creates nor truncates
    *export_file*.

    Args:
        database: Name of the database to dump.
        export_file: Path of the SQL file to write.
        mysqldump_path: Path of the mysqldump executable.
        user: Database user passed to mysqldump via ``-u``.

    Returns:
        True if the dump was written, False otherwise (the reason is printed).
    """
    command = [
        mysqldump_path,
        '-u', user,
        '--no-data',
        database
    ]

    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as e:
        # Raised when the executable is missing or cannot be started.
        print("Error occurred:", e)
        return False

    if result.returncode != 0:
        print("Error occurred:", result.stderr.decode(errors="replace"))
        return False

    # Binary mode keeps mysqldump's bytes exactly as they were produced.
    with open(export_file, 'wb') as output_file:
        output_file.write(result.stdout)

    print(f"Schema dump successful: {export_file}")
    return True
|
|
|
|
|
|
if __name__ == "__main__":
    # The script currently runs a fixed example instead of the command-line
    # interface.  For CLI usage, call main() here and run for instance:
    #   python codebase_to_text.py . -o my_project.txt --ignore "dist/" "*.tmp" "/tests/data/" --exclude_hidden -v
    #   python codebase_to_text.py https://github.com/user/repo.git -o repo_code.docx -t docx -vv

    # --- Direct call example (useful for testing) ---
    try:
        print("Running direct example...")

        # Patterns left out of the export: common build/VCS folders, logs,
        # a generated stylesheet folder and this script itself.
        example_ignores = [
            "venv/",
            "*.log",
            "/output.txt",
            ".git/",
            "__pycache__/",
            "public/css/",
            "codebase-to-text.py",
        ]

        # Export the current directory to a text file, quietly (verbose may
        # be 0, 1 or 2) and without hidden entries.
        example_settings = {
            "input_path": ".",
            "output_path": "_codebase_output.txt",
            "output_type": "txt",
            "verbose": 0,
            "exclude_hidden": True,
            "ignored_paths": example_ignores,
        }
        converter = CodebaseToText(**example_settings)
        converter.get_file()
        print("Direct example finished.")

        # Also export the database schema; the command-line equivalent is
        #   d:\xampp\mysql\bin\mysqldump.exe -u root --no-data tp_servicedesk > _codebase_schemafile.sql
        _sql_dump('tp_servicedesk', '_codebase_schemafile.sql')
    except Exception as e:
        print(f"Error running direct example: {e}")

    # Uncomment the line below to enable command-line argument parsing when
    # running the script directly.
    # main()
|
|
|