tp_servicedesk/codebase-to-text.py
2025-04-30 02:15:45 +01:00


import os
import argparse
import subprocess
import git  # Used for git.Repo.clone_from when the input is a remote repository URL
import shutil
import tempfile
from pathlib import Path
from docx import Document
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern # Explicit import often good practice


class CodebaseToText:
    def __init__(self, input_path, output_path, output_type, verbose, exclude_hidden, ignored_paths=None):
        # Normalize local input paths early; leave repository URLs untouched, since
        # os.path.abspath would mangle a URL such as "https://github.com/user/repo.git"
        self.input_path = input_path
        if not self.is_github_repo():
            self.input_path = os.path.abspath(input_path)  # Use absolute path for consistency
        self.output_path = output_path
        self.output_type = output_type
        self.verbose = verbose
        self.exclude_hidden = exclude_hidden
        self.ignored_paths = ignored_paths if ignored_paths else []  # Store custom ignores
        self.temp_folder_path = None  # Used only if cloning
        self.is_cloned_repo = False  # Flag to track if we cloned
        self.git_ignore_spec = None
        self.custom_ignore_spec = None
        self._initialize_ignores()  # Load ignores after setting input_path
    def _initialize_ignores(self):
        """Loads .gitignore and initializes custom ignore spec."""
        # Load .gitignore relative to the current input_path
        gitignore_path = os.path.join(self.input_path, ".gitignore")
        if os.path.exists(gitignore_path):
            try:
                with open(gitignore_path, 'r', encoding='utf-8') as f:  # Specify encoding
                    lines = f.read().splitlines()
                # Filter out empty lines and comments
                lines = [line for line in lines if line.strip() and not line.strip().startswith('#')]
                if lines:
                    self.git_ignore_spec = PathSpec.from_lines(GitWildMatchPattern, lines)
                    if self.verbose:
                        print(f"Loaded .gitignore rules from: {gitignore_path}")
            except Exception as e:
                print(f"Warning: Could not read .gitignore file at {gitignore_path}: {e}")
        elif self.verbose:
            print(f"No .gitignore file found at: {gitignore_path}")

        # Create PathSpec for custom ignored paths
        if self.ignored_paths:
            # Filter out empty lines/patterns just in case
            valid_custom_paths = [p for p in self.ignored_paths if p.strip()]
            if valid_custom_paths:
                self.custom_ignore_spec = PathSpec.from_lines(GitWildMatchPattern, valid_custom_paths)
                if self.verbose:
                    print(f"Using custom ignore rules: {valid_custom_paths}")
            else:
                self.ignored_paths = []  # Clear if only contained empty strings
    def _is_path_ignored(self, file_or_dir_path):
        """Checks if a given path should be ignored based on all rules."""
        try:
            # Calculate relative path from the project root (self.input_path)
            # Use pathlib for robustness
            base_path = Path(self.input_path)
            target_path = Path(file_or_dir_path)
            # Use absolute paths temporarily to ensure correct relative calculation
            rel_path = target_path.relative_to(base_path).as_posix()  # Use POSIX paths for pathspec
        except ValueError:
            # If the path is not relative to input_path (shouldn't normally happen with os.walk)
            if self.verbose:
                print(f"Warning: Path {file_or_dir_path} is not relative to {self.input_path}. Skipping ignore checks for it.")
            return False  # Or decide how to handle this case

        # Check .gitignore rules
        if self.git_ignore_spec and self.git_ignore_spec.match_file(rel_path):
            if self.verbose > 1:  # More detailed verbose logging if needed
                print(f"Ignoring '{rel_path}' (gitignore)")
            return True

        # Check custom ignore rules
        if self.custom_ignore_spec and self.custom_ignore_spec.match_file(rel_path):
            if self.verbose > 1:
                print(f"Ignoring '{rel_path}' (custom)")
            return True

        # Check if hidden files/dirs should be excluded
        # Note: PathSpec patterns can also match hidden files (e.g., '.*'),
        # so this check is primarily for the simple dot/underscore prefix rule.
        if self.exclude_hidden and self._is_hidden_path_component(target_path):
            if self.verbose > 1:
                print(f"Ignoring '{rel_path}' (hidden)")
            return True

        return False
    def _is_hidden_path_component(self, path_obj: Path):
        """Checks if any component of the path starts with '.' or '__'."""
        # Check the name itself and its parents relative to the base input path
        relative_parts = path_obj.relative_to(self.input_path).parts
        return any(part.startswith(('.', '__')) for part in relative_parts if part != '.')
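    # Illustrative examples (hypothetical paths, assuming input_path is "/project"):
    #   _is_hidden_path_component(Path("/project/.venv/lib"))        -> True   ('.venv')
    #   _is_hidden_path_component(Path("/project/src/__pycache__"))  -> True   ('__pycache__')
    #   _is_hidden_path_component(Path("/project/src/app.py"))       -> False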
    def _parse_folder(self, folder_path):
        """Generates the directory tree string, respecting ignore rules."""
        tree = ""
        base_level = folder_path.count(os.sep)
        # Ensure folder_path is absolute for consistent relative path calculations
        abs_folder_path = Path(folder_path).resolve()

        for root, dirs, files in os.walk(abs_folder_path, topdown=True):
            abs_root_path = Path(root).resolve()

            # --- Directory Ignore Logic ---
            # Filter directories *before* recursing into them
            # Keep track of original dirs list to modify dirs[:]
            original_dirs = list(dirs)
            dirs[:] = []  # Clear dirs list, we will re-add ones we want to keep
            for d in original_dirs:
                dir_path = abs_root_path / d
                # Skip .git directory explicitly (essential)
                if d == ".git":
                    if self.verbose > 1:
                        print(f"Skipping .git directory: {dir_path}")
                    continue
                if self._is_path_ignored(str(dir_path)):
                    if self.verbose:
                        print(f"Ignoring directory: {dir_path.relative_to(self.input_path)}")
                    # Don't add 'd' back to dirs[:], effectively pruning the walk
                else:
                    dirs.append(d)  # Keep this directory for recursion

            # --- Calculate Tree Indentation ---
            try:
                # Calculate level relative to the *initial* input path for correct indentation
                rel_root = abs_root_path.relative_to(self.input_path)
                level = len(rel_root.parts) if rel_root.parts != ('.',) else 0
            except ValueError:
                # Should not happen if os.walk starts within input_path
                print(f"Warning: Cannot determine relative path for {abs_root_path}. Using level 0.")
                level = 0
            indent = '    ' * level  # 4 spaces per level

            # Add directory entry to tree (only if it's not the root itself processed initially)
            if abs_root_path != Path(self.input_path).resolve():  # Don't print root '/'
                tree += f"{indent}{abs_root_path.name}/\n"
            elif level == 0 and not tree:  # Print root marker only once at the start
                tree += f"{Path(self.input_path).name}/\n"

            # --- File Listing ---
            subindent = '    ' * (level + 1)
            sorted_files = sorted(files)  # Sort files for consistent output
            for f in sorted_files:
                file_path = abs_root_path / f
                # Check if file is ignored
                if not self._is_path_ignored(str(file_path)):
                    tree += f"{subindent}{f}\n"
                elif self.verbose:
                    # Note: _is_path_ignored already prints detailed reasons if verbose > 1
                    print(f"Ignoring file (in tree): {file_path.relative_to(self.input_path)}")

        if self.verbose:
            print(f"\n--- Generated File Tree ---\n{tree}")
            print("--- End File Tree ---\n")
        return tree
    def _get_file_contents(self, file_path):
        """Reads file content, handling potential encoding issues."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            try:
                # Try a fallback encoding (e.g., latin-1 or detected encoding)
                with open(file_path, 'r', encoding='latin-1') as file:
                    if self.verbose:
                        print(f"Warning: Used fallback encoding 'latin-1' for {file_path}")
                    return file.read()
            except Exception as e:
                print(f"Error: Could not read file {file_path} with utf-8 or latin-1: {e}")
                return f"Error reading file: {e}"  # Include error message in output
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            return f"Error reading file: {e}"
    def _process_files(self, path_to_walk):
        """Walks through files and concatenates their content, respecting ignores."""
        content = ""
        base_path = Path(self.input_path).resolve()  # Use resolved base path

        for root, dirs, files in os.walk(path_to_walk, topdown=True):
            abs_root_path = Path(root).resolve()

            # --- Directory Pruning (same logic as in _parse_folder) ---
            original_dirs = list(dirs)
            dirs[:] = []
            for d in original_dirs:
                dir_path = abs_root_path / d
                if d == ".git" or self._is_path_ignored(str(dir_path)):
                    continue  # Skip ignored or .git dirs
                else:
                    dirs.append(d)

            # --- Process Files in Current Directory ---
            sorted_files = sorted(files)
            for file_name in sorted_files:
                file_path = abs_root_path / file_name
                str_file_path = str(file_path)

                # Skip ignored files
                if self._is_path_ignored(str_file_path):
                    if self.verbose:
                        print(f"Ignoring file (content): {file_path.relative_to(base_path)}")
                    continue

                # Try to get content
                try:
                    if self.verbose:
                        print(f"Processing: {file_path.relative_to(base_path)}")
                    file_content = self._get_file_contents(str_file_path)
                    rel_file_path_display = file_path.relative_to(base_path).as_posix()  # Display relative path
                    content += f"\n\n--- File: {rel_file_path_display} ---\n"
                    # Optional: Add file type hint
                    # content += f"File type: {os.path.splitext(file_name)[1]}\n\n"
                    content += file_content
                    # Use a clear end marker
                    content += f"\n--- End File: {rel_file_path_display} ---\n"
                except Exception as e:  # Catch potential errors during processing
                    print(f"Couldn't process {file_path.relative_to(base_path)}: {e}")
                    content += f"\n\n--- Error processing file: {file_path.relative_to(base_path)} --- \n {e} \n--- End Error ---\n"

        return content
    def get_text(self):
        """Generates the final combined text output."""
        # --- Decide whether to clone or use local path ---
        process_path = self.input_path  # Default to local path
        if self.is_github_repo():
            success = self._clone_github_repo()
            if success:
                process_path = self.temp_folder_path
                self.is_cloned_repo = True
                # Re-initialize ignores for the cloned repo location
                self.input_path = process_path  # Temporarily change base for ignore checks
                self._initialize_ignores()
                print(f"Processing cloned repo at: {process_path}")
            else:
                print("Error: Failed to clone GitHub repository. Aborting.")
                # Reset input_path if cloning failed and we modified it
                if self.is_cloned_repo:
                    self.input_path = os.path.dirname(self.temp_folder_path)  # Hacky way to get original path back conceptually
                return "Error: Could not clone repository."  # Return error message
        else:
            print(f"Processing local path: {process_path}")
            # Ensure ignores are initialized for the local path (done in __init__)

        # --- Generate Structure and Content ---
        folder_structure = self._parse_folder(process_path)
        file_contents = self._process_files(process_path)

        # --- Assemble Final Output ---
        folder_structure_header = "--- Folder Structure ---"
        file_contents_header = "--- File Contents ---"
        delimiter = "=" * 60  # Use a more prominent delimiter

        # Restore original input_path if it was changed for cloning
        if self.is_cloned_repo:
            # This assumes the original input_path wasn't needed after _initialize_ignores
            # A cleaner way might be to pass the base path explicitly to ignore checkers
            pass  # No need to restore if input_path wasn't critical after cloning

        return (
            f"{folder_structure_header}\n{folder_structure}\n{delimiter}\n\n"
            f"{file_contents_header}\n{file_contents}\n{delimiter}\nEnd of Codebase\n{delimiter}"
        )
    def get_file(self):
        """Gets the text and saves it to the specified output file."""
        text_content = self.get_text()

        # Check for error during get_text (e.g., cloning failure)
        if text_content.startswith("Error:"):
            print(text_content)  # Print the error
            # Optionally, clean up temp folder even on error
            self.clean_up_temp_folder()
            return  # Exit without writing file

        try:
            # Ensure output directory exists
            output_dir = os.path.dirname(self.output_path)
            if output_dir:  # Handle case where output is in current dir
                os.makedirs(output_dir, exist_ok=True)

            if self.output_type == "txt":
                with open(self.output_path, "w", encoding='utf-8') as file:
                    file.write(text_content)
            elif self.output_type == "docx":
                doc = Document()
                # Add text respecting paragraphs (simple split, might need refinement)
                # Consider adding as preformatted text run if python-docx supports it well
                for paragraph in text_content.split('\n'):
                    doc.add_paragraph(paragraph)
                doc.save(self.output_path)
            else:
                # Should be caught by argparse choices usually, but good to have
                raise ValueError(f"Invalid output type '{self.output_type}'. Supported types: txt, docx")
            print(f"\nSuccessfully generated {self.output_type} file: {self.output_path}")
        except Exception as e:
            print(f"\nError writing output file {self.output_path}: {e}")
        finally:
            # Clean up temp folder regardless of writing success/failure
            self.clean_up_temp_folder()
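    # Sketch of the "preformatted text" idea noted in get_file(): write each line of the
    # report as a monospace run instead of a plain paragraph. Not wired into get_file();
    # the 'Courier New' font and 9pt size are arbitrary choices, not project requirements.
    def _add_preformatted_text(self, doc, text_content):
        from docx.shared import Pt  # local import; only this sketch needs it
        for line in text_content.split('\n'):
            run = doc.add_paragraph().add_run(line)
            run.font.name = 'Courier New'
            run.font.size = Pt(9)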
    #### GitHub ####
    def _clone_github_repo(self):
        """Clones the repo to a temporary directory."""
        try:
            # Create temp dir *before* cloning into it
            # Use a more descriptive prefix/suffix if desired
            self.temp_folder_path = tempfile.mkdtemp(prefix="cbt_repo_")
            print(f"Cloning {self.input_path} into temporary folder {self.temp_folder_path}...")
            git.Repo.clone_from(self.input_path, self.temp_folder_path)
            # Important: Update self.input_path to the temp folder *for processing*
            # self.input_path = self.temp_folder_path  # Now done within get_text
            if self.verbose:
                print("GitHub repository cloned successfully.")
            return True  # Indicate success
        except git.GitCommandError as e:
            print(f"Error cloning GitHub repository: {e}")
            # Clean up failed clone attempt
            self.clean_up_temp_folder()  # Ensure cleanup even on clone failure
            self.temp_folder_path = None  # Reset path
            return False  # Indicate failure
        except Exception as e:
            print(f"An unexpected error occurred during cloning: {e}")
            self.clean_up_temp_folder()  # Ensure cleanup
            self.temp_folder_path = None  # Reset path
            return False  # Indicate failure
    def is_github_repo(self):
        """Checks if the input path looks like a common Git repo URL."""
        # Keep it simple, add more patterns if needed
        return self.input_path.startswith((
            "https://github.com/", "git@github.com:",
            "https://gitlab.com/", "git@gitlab.com:",
            "https://bitbucket.org/", "git@bitbucket.org:",
        )) or self.input_path.endswith(".git")  # Common convention for clone URLs
    def clean_up_temp_folder(self):
        """Removes the temporary folder if it was created."""
        if self.temp_folder_path and os.path.exists(self.temp_folder_path):
            try:
                shutil.rmtree(self.temp_folder_path)
                if self.verbose:
                    print(f"Cleaned up temporary folder: {self.temp_folder_path}")
                self.temp_folder_path = None  # Reset path after successful removal
            except Exception as e:
                print(f"Warning: Could not remove temporary folder {self.temp_folder_path}: {e}")


# --- Main Execution ---
def main():
    parser = argparse.ArgumentParser(
        description="Generate a single text or docx file from a codebase, respecting .gitignore and custom ignore rules.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter  # Show defaults
    )
    parser.add_argument(
        "input",
        help="Input path (local folder path or Git repository URL)."
    )
    parser.add_argument(
        "-o", "--output",
        required=True,
        help="Output file path (e.g., my_codebase.txt or output/report.docx)."
    )
    parser.add_argument(
        "-t", "--output_type",
        choices=["txt", "docx"],
        default="txt",
        help="Output file type."
    )
    parser.add_argument(
        "--ignore",
        nargs='*', default=[],  # Accept zero or more ignore patterns
        help="List of additional paths/patterns to ignore (e.g., 'dist/', '/node_modules', '*.log'). Matched relative to the input path root."
    )
    parser.add_argument(
        "--exclude_hidden",
        action="store_true",
        help="Exclude files and folders starting with '.' or '__'. Note that .gitignore or custom ignores might already cover these."
    )
    parser.add_argument(
        "-v", "--verbose",
        action="count", default=0,  # Use count for verbosity levels (0, 1, 2)
        help="Increase output verbosity. -v for basic info, -vv for detailed ignore reasons."
    )
    args = parser.parse_args()

    # Basic validation
    if not args.input:
        parser.error("Input path cannot be empty.")
    if not args.output:
        parser.error("Output path cannot be empty.")

    code_to_text = None  # Ensure it's defined for the cleanup in the except block
    try:
        code_to_text = CodebaseToText(
            input_path=args.input,
            output_path=args.output,
            output_type=args.output_type,
            verbose=args.verbose,
            exclude_hidden=args.exclude_hidden,
            ignored_paths=args.ignore  # Pass the list here
        )
        code_to_text.get_file()
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        # Attempt cleanup even if initialization failed partially
        if code_to_text:
            code_to_text.clean_up_temp_folder()
    # No finally block needed here as get_file() now handles cleanup


def _sql_dump(database, export_file):
    command = [
        r'd:\xampp\mysql\bin\mysqldump.exe',
        '-u', 'root',
        '--no-data',
        database
    ]
    with open(export_file, 'w') as output_file:
        result = subprocess.run(command, stdout=output_file, stderr=subprocess.PIPE)
    if result.returncode == 0:
        print(f"Schema dump successful: {export_file}")
    else:
        print("Error occurred:", result.stderr.decode())


if __name__ == "__main__":
    # --- Example Usage (replace with main() for CLI) ---
    # To run from the command line, save the script (e.g., codebase_to_text.py) and run:
    #   python codebase_to_text.py . -o my_project.txt --ignore "dist/" "*.tmp" "/tests/data/" --exclude_hidden -v
    #   python codebase_to_text.py https://github.com/user/repo.git -o repo_code.docx -t docx -vv

    # --- Direct call example (useful for testing) ---
    try:
        print("Running direct example...")
        # Example: Process the current directory, output to _codebase_output.txt,
        # ignoring the 'venv' folder and all '.log' files
        example_ignores = ["venv/", "*.log", "/output.txt", ".git/", "__pycache__/"]  # Add common ignores
        example_ignores.append("public/css/")
        example_ignores.append("codebase-to-text.py")
        converter = CodebaseToText(
            input_path=".",
            output_path="_codebase_output.txt",
            output_type="txt",
            verbose=0,  # Set verbosity level (0, 1, or 2)
            exclude_hidden=True,
            ignored_paths=example_ignores
        )
        converter.get_file()
        print("Direct example finished.")
        _sql_dump('tp_servicedesk', '_codebase_schemafile.sql')
    except Exception as e:
        print(f"Error running direct example: {e}")

    # Uncomment the line below to enable command-line argument parsing when running the script directly
    # main()

    # To get a SQL dump, use something similar to:
    #   d:\xampp\mysql\bin\mysqldump.exe -u root --no-data tp_servicedesk > _codebase_schemafile.sql