Skip to content

Commit 9367185

Browse files
committed
Added text parsing and refactor
1 parent 5c82c63 commit 9367185

File tree

6 files changed

+195
-79
lines changed

6 files changed

+195
-79
lines changed

package-lock.json

+6-5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
{
2-
"name": "git-repo-to-json",
3-
"version": "2.0.5",
2+
"name": "git-repo-parser",
3+
"version": "2.0.6",
44
"description": "A tool to scrape all files from a GitHub repository and turn it into a JSON file",
55
"bin": {
6-
"git-repo-to-json": "dist/cli.js"
6+
"git-repo-to-json": "dist/clijson.js",
7+
"git-repo-to-text": "dist/clitext.js"
78
},
89
"files": [
910
"dist"

src/cli.ts renamed to src/clijson.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env node
22

3-
import { scrapeRepository } from './index';
3+
import { scrapeRepositoryToJson } from "./scraper";
44
import * as fs from 'fs';
55

66
async function main() {
@@ -12,7 +12,7 @@ async function main() {
1212
}
1313

1414
// Scrape the repository and get the result
15-
const result = await scrapeRepository(repoUrl);
15+
const result = await scrapeRepositoryToJson(repoUrl);
1616
const jsonResult = JSON.stringify(result, null, 2);
1717

1818
// Write the JSON to a file

src/clitext.ts

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env node
2+
3+
import { scrapeRepositoryToPlainText } from "./scraper";
4+
import * as fs from 'fs';
5+
6+
async function main() {
7+
const repoUrl = process.argv[2]; // Get the repository URL from command line arguments
8+
9+
if (!repoUrl) {
10+
console.error('Please provide a GitHub repository URL.');
11+
process.exit(1);
12+
}
13+
14+
// Scrape the repository and get the result
15+
const result = await scrapeRepositoryToPlainText(repoUrl);
16+
17+
// Write the JSON to a file
18+
fs.writeFileSync('files.txt', result);
19+
console.log('File list has been saved to files.text');
20+
}
21+
22+
main().catch(err => console.error(err));

src/index.ts

+2-69
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,3 @@
1-
import * as fs from 'fs';
2-
import * as path from 'path';
3-
import simpleGit from 'simple-git';
1+
import { scrapeRepositoryToJson, scrapeRepositoryToPlainText } from './scraper';
42

5-
export interface FileData {
6-
name: string;
7-
path: string;
8-
type: 'file' | 'directory';
9-
children?: FileData[];
10-
content?: string;
11-
}
12-
13-
async function cloneRepository(repoUrl: string, clonePath: string) {
14-
const git = simpleGit();
15-
await git.clone(repoUrl, clonePath);
16-
console.log(`Repository cloned to ${clonePath}`);
17-
}
18-
19-
function scrapeDirectory(dir: string, ignorePatterns: string[] = []): FileData[] {
20-
const files = fs.readdirSync(dir);
21-
return files.filter(file => {
22-
const filePath = path.join(dir, file);
23-
return !ignorePatterns.some(pattern => filePath.includes(pattern));
24-
}).map(file => {
25-
const filePath = path.join(dir, file);
26-
const stat = fs.statSync(filePath);
27-
28-
if (stat.isDirectory()) {
29-
// Ignore the .git directory
30-
if (file === '.git') {
31-
return null;
32-
}
33-
return {
34-
name: file,
35-
path: filePath,
36-
type: 'directory',
37-
children: scrapeDirectory(filePath, ignorePatterns)
38-
};
39-
} else {
40-
const content = fs.readFileSync(filePath, 'utf-8');
41-
return {
42-
name: file,
43-
path: filePath,
44-
type: 'file',
45-
content: content
46-
};
47-
}
48-
}).filter(item => item !== null) as FileData[];
49-
}
50-
51-
export async function scrapeRepository(repoUrl: string): Promise<FileData[]> {
52-
const repoName = repoUrl.split('/').pop()?.replace('.git', '');
53-
if (!repoName) {
54-
throw new Error('Invalid repository URL');
55-
}
56-
const clonePath = `./${repoName}`; // Directory where the repository will be cloned
57-
58-
// Clone the repository
59-
await cloneRepository(repoUrl, clonePath);
60-
61-
// Scrape the cloned repository directory
62-
const ignorePatterns = ['.git'];
63-
const result = scrapeDirectory(clonePath, ignorePatterns);
64-
65-
// Clean up the cloned repository
66-
fs.rmdirSync(clonePath, { recursive: true });
67-
console.log('Cloned repository directory removed');
68-
69-
return result;
70-
}
3+
export { scrapeRepositoryToJson, scrapeRepositoryToPlainText };

src/scraper.ts

+159
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import * as fs from 'fs';
2+
import * as path from 'path';
3+
import simpleGit from 'simple-git';
4+
5+
export interface FileData {
6+
name: string;
7+
path: string;
8+
type: 'file' | 'directory';
9+
children?: FileData[];
10+
content?: string;
11+
}
12+
13+
async function cloneRepository(repoUrl: string, clonePath: string) {
14+
const git = simpleGit();
15+
await git.clone(repoUrl, clonePath);
16+
console.log(`Repository cloned to ${clonePath}`);
17+
}
18+
19+
function shouldIgnoreFile(fileName: string): boolean {
20+
const lowerCaseFileName = fileName.toLowerCase();
21+
return (
22+
lowerCaseFileName === 'package-lock.json' ||
23+
lowerCaseFileName.endsWith('.pdf') ||
24+
lowerCaseFileName.endsWith('.png') ||
25+
lowerCaseFileName.endsWith('.jpg') ||
26+
lowerCaseFileName.endsWith('.jpeg') ||
27+
lowerCaseFileName.endsWith('.gif') ||
28+
lowerCaseFileName.endsWith('.ico') ||
29+
lowerCaseFileName.endsWith('.svg') ||
30+
lowerCaseFileName.endsWith('.woff') ||
31+
lowerCaseFileName.endsWith('.woff2') ||
32+
lowerCaseFileName.endsWith('.eot') ||
33+
lowerCaseFileName.endsWith('.ttf') ||
34+
lowerCaseFileName.endsWith('.otf') ||
35+
lowerCaseFileName.endsWith('.mp4') ||
36+
lowerCaseFileName.endsWith('.avi') ||
37+
lowerCaseFileName.endsWith('.webm') ||
38+
lowerCaseFileName.endsWith('.mov') ||
39+
lowerCaseFileName.endsWith('.mp3') ||
40+
lowerCaseFileName.endsWith('.wav') ||
41+
lowerCaseFileName.endsWith('.flac') ||
42+
lowerCaseFileName.endsWith('.ogg') ||
43+
lowerCaseFileName.endsWith('.webp') ||
44+
lowerCaseFileName.startsWith('package-lock') ||
45+
lowerCaseFileName.startsWith('yarn-lock') ||
46+
lowerCaseFileName.startsWith('npm-debug') ||
47+
lowerCaseFileName.startsWith('yarn-debug') ||
48+
lowerCaseFileName.startsWith('yarn-error') ||
49+
lowerCaseFileName.startsWith('tsconfig') ||
50+
lowerCaseFileName.startsWith('jest.config')
51+
52+
// Add more extensions as needed
53+
);
54+
}
55+
56+
function scrapeDirectoryToJson(dir: string, ignorePatterns: string[] = []): FileData[] {
57+
const files = fs.readdirSync(dir);
58+
return files.filter(file => {
59+
const filePath = path.join(dir, file);
60+
return (
61+
!ignorePatterns.some(pattern => filePath.includes(pattern)) &&
62+
!shouldIgnoreFile(file)
63+
);
64+
}).map(file => {
65+
const filePath = path.join(dir, file);
66+
const stat = fs.statSync(filePath);
67+
68+
if (stat.isDirectory()) {
69+
// Ignore the .git directory
70+
if (file === '.git') {
71+
return null;
72+
}
73+
return {
74+
name: file,
75+
path: filePath,
76+
type: 'directory',
77+
children: scrapeDirectoryToJson(filePath, ignorePatterns)
78+
};
79+
} else {
80+
const content = fs.readFileSync(filePath, 'utf-8');
81+
return {
82+
name: file,
83+
path: filePath,
84+
type: 'file',
85+
content: content
86+
};
87+
}
88+
}).filter(item => item !== null) as FileData[];
89+
}
90+
91+
function scrapeDirectoryToPlainText(dir: string, ignorePatterns: string[] = [], prefix: string = ''): string {
92+
let result = '';
93+
94+
const files = fs.readdirSync(dir);
95+
files.forEach(file => {
96+
const filePath = path.join(dir, file);
97+
98+
if (ignorePatterns.some(pattern => filePath.includes(pattern)) || shouldIgnoreFile(file)) {
99+
return;
100+
}
101+
102+
const stat = fs.statSync(filePath);
103+
104+
if (stat.isDirectory()) {
105+
// Ignore the .git directory
106+
if (file === '.git') {
107+
return;
108+
}
109+
result += scrapeDirectoryToPlainText(filePath, ignorePatterns, path.join(prefix, file));
110+
} else {
111+
const content = fs.readFileSync(filePath, 'utf-8');
112+
result += `${path.join(prefix, file)}\n${content}\n\n`;
113+
}
114+
});
115+
116+
return result;
117+
}
118+
119+
export async function scrapeRepositoryToJson(repoUrl: string): Promise<FileData[]> {
120+
const repoName = repoUrl.split('/').pop()?.replace('.git', '');
121+
if (!repoName) {
122+
throw new Error('Invalid repository URL');
123+
}
124+
const clonePath = `./${repoName}`; // Directory where the repository will be cloned
125+
126+
// Clone the repository
127+
await cloneRepository(repoUrl, clonePath);
128+
129+
// Scrape the cloned repository directory
130+
const ignorePatterns = ['.git'];
131+
const result = scrapeDirectoryToJson(clonePath, ignorePatterns);
132+
133+
// Clean up the cloned repository
134+
fs.rmdirSync(clonePath, { recursive: true });
135+
console.log('Cloned repository directory removed');
136+
137+
return result;
138+
}
139+
140+
export async function scrapeRepositoryToPlainText(repoUrl: string): Promise<string> {
141+
const repoName = repoUrl.split('/').pop()?.replace('.git', '');
142+
if (!repoName) {
143+
throw new Error('Invalid repository URL');
144+
}
145+
const clonePath = `./${repoName}`; // Directory where the repository will be cloned
146+
147+
// Clone the repository
148+
await cloneRepository(repoUrl, clonePath);
149+
150+
// Scrape the cloned repository directory
151+
const ignorePatterns = ['.git'];
152+
const result = scrapeDirectoryToPlainText(clonePath, ignorePatterns);
153+
154+
// Clean up the cloned repository
155+
fs.rmdirSync(clonePath, { recursive: true });
156+
console.log('Cloned repository directory removed');
157+
158+
return result;
159+
}

0 commit comments

Comments
 (0)