diff --git a/.gitignore b/.gitignore index 99dfff9..4c48363 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ go.work *.zip text.txt text.json +git2gpt diff --git a/README.md b/README.md index 6a0fab2..ca02121 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,8 @@ git2gpt is a command-line utility that converts a Git repository to text for loading into ChatGPT and other NLP models. The output text file represents the Git repository in a structured format. You can also add a `.gptignore` file to your repos to have git2gpt ignore certain files. The text is prefixed with a preamble that explains to the AI what the text is: - > The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context. - ## Installation First, make sure you have the Go programming language installed on your system. You can download it from [the official Go website](https://golang.org/dl/). @@ -32,12 +30,13 @@ By default, your `.git` directory and your `.gitignore` files are ignored. Any f ### Flags -* `-p`, `--preamble`: Path to a text file containing a preamble to include at the beginning of the output file. -* `-o`, `--output`: Path to the output file. If not specified, will print to standard output. -* `-e`, `--estimate`: Estimate the tokens of the output file. If not specified, does not estimate. -* `-j`, `--json`: Output to JSON rather than plain text. Use with `-o` to specify the output file. -* `-i`, `--ignore`: Path to the `.gptignore` file. If not specified, will look for a `.gptignore` file in the same directory as the `.gitignore` file. 
-* `-g`, `--ignore-gitignore`: Ignore the `.gitignore` file. +* `-p`, `--preamble`: Path to a text file containing a preamble to include at the beginning of the output file. +* `-o`, `--output`: Path to the output file. If not specified, will print to standard output. +* `-e`, `--estimate`: Estimate the tokens of the output file. If not specified, does not estimate. +* `-j`, `--json`: Output to JSON rather than plain text. Use with `-o` to specify the output file. +* `-i`, `--ignore`: Path to the `.gptignore` file. If not specified, will look for a `.gptignore` file in the same directory as the `.gitignore` file. +* `-g`, `--ignore-gitignore`: Ignore the `.gitignore` file. +* `-s`, `--scrub-comments`: Remove comments from the output file to save tokens. ## Contributing diff --git a/cmd/root.go b/cmd/root.go index 2bde45e..80cd3ae 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -16,6 +16,7 @@ var ignoreFilePath string var ignoreGitignore bool var outputJSON bool var debug bool +var scrubComments bool var rootCmd = &cobra.Command{ Use: "git2gpt [flags] /path/to/git/repository", @@ -30,7 +31,7 @@ var rootCmd = &cobra.Command{ os.Exit(1) } if outputJSON { - output, err := prompt.MarshalRepo(repo) + output, err := prompt.MarshalRepo(repo, scrubComments) if err != nil { fmt.Printf("Error: %s\n", err) os.Exit(1) @@ -53,7 +54,7 @@ var rootCmd = &cobra.Command{ } return } - output, err := prompt.OutputGitRepo(repo, preambleFile) + output, err := prompt.OutputGitRepo(repo, preambleFile, scrubComments) if err != nil { fmt.Printf("Error: %s\n", err) os.Exit(1) @@ -94,6 +95,8 @@ func init() { rootCmd.Flags().BoolVarP(&outputJSON, "json", "j", false, "output JSON") // debug. Should be a bool rootCmd.Flags().BoolVarP(&debug, "debug", "d", false, "debug mode. Do not output to standard output") + // scrub comments. Should be a bool + rootCmd.Flags().BoolVarP(&scrubComments, "scrub-comments", "s", false, "scrub comments from the output. 
Decreases token count") } func Execute() { diff --git a/go.mod b/go.mod index a067afe..c14e281 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,9 @@ require ( ) require ( + github.com/dlclark/regexp2 v1.10.0 // indirect + github.com/google/uuid v1.3.0 // indirect github.com/inconshreveable/mousetrap v1.0.1 // indirect + github.com/pkoukk/tiktoken-go v0.1.6 // indirect github.com/spf13/pflag v1.0.5 // indirect ) diff --git a/go.sum b/go.sum index 92875a5..fb4fa5e 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,14 @@ github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0= +github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/inconshreveable/mousetrap v1.0.1 h1:U3uMjPSQEBMNp1lFxmllqCPM6P5u/Xq7Pgzkat/bFNc= github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/pkoukk/tiktoken-go v0.1.6 h1:JF0TlJzhTbrI30wCvFuiw6FzP2+/bR+FIxUdgEAcUsw= +github.com/pkoukk/tiktoken-go v0.1.6/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA= github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY= diff --git a/prompt/prompt.go b/prompt/prompt.go index 57a079a..b1facb6 100644 --- a/prompt/prompt.go +++ b/prompt/prompt.go @@ -4,13 +4,14 @@ import ( "bufio" "encoding/json" "fmt" - "math" "os" "path/filepath" "strings" "unicode/utf8" + 
"github.com/chand1012/git2gpt/utils" "github.com/gobwas/glob" + "github.com/pkoukk/tiktoken-go" ) // GitFile is a file in a Git repository @@ -89,7 +90,7 @@ func GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []st // .gptignore file exists ignoreList, _ = getIgnoreList(ignoreFilePath) } - ignoreList = append(ignoreList, ".git/**", ".gitignore") + ignoreList = append(ignoreList, ".git/**", ".gitignore", ".gptignore") if useGitignore { gitignorePath := filepath.Join(repoPath, ".gitignore") @@ -131,7 +132,7 @@ func ProcessGitRepo(repoPath string, ignoreList []string) (*GitRepo, error) { } // OutputGitRepo outputs a Git repository to a text file -func OutputGitRepo(repo *GitRepo, preambleFile string) (string, error) { +func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (string, error) { var repoBuilder strings.Builder if preambleFile != "" { @@ -148,6 +149,9 @@ func OutputGitRepo(repo *GitRepo, preambleFile string) (string, error) { for _, file := range repo.Files { repoBuilder.WriteString("----\n") repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Path)) + if scrubComments { + file.Contents = utils.RemoveCodeComments(file.Contents) + } repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Contents)) } @@ -160,9 +164,9 @@ func OutputGitRepo(repo *GitRepo, preambleFile string) (string, error) { return output, nil } -func MarshalRepo(repo *GitRepo) ([]byte, error) { +func MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) { // run the output function to get the total tokens - _, err := OutputGitRepo(repo, "") + _, err := OutputGitRepo(repo, "", scrubComments) if err != nil { return nil, fmt.Errorf("error marshalling repo: %w", err) } @@ -208,9 +212,12 @@ func processRepository(repoPath string, ignoreList []string, repo *GitRepo) erro // EstimateTokens estimates the number of tokens in a string func EstimateTokens(output string) int64 { - tokenCount := float64(len(output)) - // divide by 3.5 to account for the fact 
that GPT-4 uses (roughly) 3.5 tokens per character - tokenCount = tokenCount / 3.5 - // round up to the nearest integer - return int64(math.Ceil(tokenCount)) + tke, err := tiktoken.GetEncoding("cl100k_base") + if err != nil { + fmt.Println("Error getting encoding:", err) + return 0 + } + + tokens := tke.Encode(output, nil, nil) + return int64(len(tokens)) } diff --git a/utils/strings.go b/utils/strings.go new file mode 100644 index 0000000..9841ea4 --- /dev/null +++ b/utils/strings.go @@ -0,0 +1,41 @@ +package utils + +import ( + "bufio" + "fmt" + "regexp" + "strings" +) + +// RemoveCodeComments removes single-line and multiline comments from the provided code string. +func RemoveCodeComments(code string) string { + // Regex for single-line comments. + singleLineCommentRegex := regexp.MustCompile(`^\s*(//|#|--|<!--).*$`) + + // Regex for multiline comments (C-style block comments and HTML comments). + multiLineCommentRegex := regexp.MustCompile(`(?s)/\*.*?\*/|<!--.*?-->`) + + // Use a scanner to process each line of the input string. + var result strings.Builder + scanner := bufio.NewScanner(strings.NewReader(code)) + for scanner.Scan() { + line := scanner.Text() + // First remove multiline comments as they may span across multiple lines. + line = multiLineCommentRegex.ReplaceAllString(line, "") + // Then remove any single-line comment parts that remain. + cleanLine := singleLineCommentRegex.ReplaceAllString(line, "") + if cleanLine != "" { + // Write the cleaned line to the result, preserving original line breaks. + result.WriteString(cleanLine + "\n") + } + } + + if err := scanner.Err(); err != nil { + fmt.Fprintln(&result, "Error reading input:", err) + } + + // Additional cleanup in case of multiline comments spanning across multiple scanned lines. + finalCleanedCode := multiLineCommentRegex.ReplaceAllString(result.String(), "") + + return strings.TrimRight(finalCleanedCode, "\n") +}