Skip to content

Commit 9367185

Browse files
committed
Added text parsing and refactor
1 parent 5c82c63 commit 9367185

File tree

6 files changed

+195
-79
lines changed

6 files changed

+195
-79
lines changed

package-lock.json

+6-5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
{
2-
"name": "git-repo-to-json",
3-
"version": "2.0.5",
2+
"name": "git-repo-parser",
3+
"version": "2.0.6",
44
"description": "A tool to scrape all files from a GitHub repository and turn it into a JSON file",
55
"bin": {
6-
"git-repo-to-json": "dist/cli.js"
6+
"git-repo-to-json": "dist/clijson.js",
7+
"git-repo-to-text": "dist/clitext.js"
78
},
89
"files": [
910
"dist"

src/cli.ts renamed to src/clijson.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env node
22

3-
import { scrapeRepository } from './index';
3+
import { scrapeRepositoryToJson } from "./scraper";
44
import * as fs from 'fs';
55

66
async function main() {
@@ -12,7 +12,7 @@ async function main() {
1212
}
1313

1414
// Scrape the repository and get the result
15-
const result = await scrapeRepository(repoUrl);
15+
const result = await scrapeRepositoryToJson(repoUrl);
1616
const jsonResult = JSON.stringify(result, null, 2);
1717

1818
// Write the JSON to a file

src/clitext.ts

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env node
2+
3+
import { scrapeRepositoryToPlainText } from "./scraper";
4+
import * as fs from 'fs';
5+
6+
async function main() {
7+
const repoUrl = process.argv[2]; // Get the repository URL from command line arguments
8+
9+
if (!repoUrl) {
10+
console.error('Please provide a GitHub repository URL.');
11+
process.exit(1);
12+
}
13+
14+
// Scrape the repository and get the result
15+
const result = await scrapeRepositoryToPlainText(repoUrl);
16+
17+
// Write the JSON to a file
18+
fs.writeFileSync('files.txt', result);
19+
console.log('File list has been saved to files.text');
20+
}
21+
22+
main().catch(err => console.error(err));

src/index.ts

+2-69
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,3 @@
1-
import * as fs from 'fs';
2-
import * as path from 'path';
3-
import simpleGit from 'simple-git';
1+
import { scrapeRepositoryToJson, scrapeRepositoryToPlainText } from './scraper';
42

5-
export interface FileData {
6-
name: string;
7-
path: string;
8-
type: 'file' | 'directory';
9-
children?: FileData[];
10-
content?: string;
11-
}
12-
13-
async function cloneRepository(repoUrl: string, clonePath: string) {
14-
const git = simpleGit();
15-
await git.clone(repoUrl, clonePath);
16-
console.log(`Repository cloned to ${clonePath}`);
17-
}
18-
19-
function scrapeDirectory(dir: string, ignorePatterns: string[] = []): FileData[] {
20-
const files = fs.readdirSync(dir);
21-
return files.filter(file => {
22-
const filePath = path.join(dir, file);
23-
return !ignorePatterns.some(pattern => filePath.includes(pattern));
24-
}).map(file => {
25-
const filePath = path.join(dir, file);
26-
const stat = fs.statSync(filePath);
27-
28-
if (stat.isDirectory()) {
29-
// Ignore the .git directory
30-
if (file === '.git') {
31-
return null;
32-
}
33-
return {
34-
name: file,
35-
path: filePath,
36-
type: 'directory',
37-
children: scrapeDirectory(filePath, ignorePatterns)
38-
};
39-
} else {
40-
const content = fs.readFileSync(filePath, 'utf-8');
41-
return {
42-
name: file,
43-
path: filePath,
44-
type: 'file',
45-
content: content
46-
};
47-
}
48-
}).filter(item => item !== null) as FileData[];
49-
}
50-
51-
export async function scrapeRepository(repoUrl: string): Promise<FileData[]> {
52-
const repoName = repoUrl.split('/').pop()?.replace('.git', '');
53-
if (!repoName) {
54-
throw new Error('Invalid repository URL');
55-
}
56-
const clonePath = `./${repoName}`; // Directory where the repository will be cloned
57-
58-
// Clone the repository
59-
await cloneRepository(repoUrl, clonePath);
60-
61-
// Scrape the cloned repository directory
62-
const ignorePatterns = ['.git'];
63-
const result = scrapeDirectory(clonePath, ignorePatterns);
64-
65-
// Clean up the cloned repository
66-
fs.rmdirSync(clonePath, { recursive: true });
67-
console.log('Cloned repository directory removed');
68-
69-
return result;
70-
}
3+
export { scrapeRepositoryToJson, scrapeRepositoryToPlainText };

src/scraper.ts

+159
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import * as fs from 'fs';
2+
import * as path from 'path';
3+
import simpleGit from 'simple-git';
4+
5+
export interface FileData {
6+
name: string;
7+
path: string;
8+
type: 'file' | 'directory';
9+
children?: FileData[];
10+
content?: string;
11+
}
12+
13+
async function cloneRepository(repoUrl: string, clonePath: string) {
14+
const git = simpleGit();
15+
await git.clone(repoUrl, clonePath);
16+
console.log(`Repository cloned to ${clonePath}`);
17+
}
18+
19+
function shouldIgnoreFile(fileName: string): boolean {
20+
const lowerCaseFileName = fileName.toLowerCase();
21+
return (
22+
lowerCaseFileName === 'package-lock.json' ||
23+
lowerCaseFileName.endsWith('.pdf') ||
24+
lowerCaseFileName.endsWith('.png') ||
25+
lowerCaseFileName.endsWith('.jpg') ||
26+
lowerCaseFileName.endsWith('.jpeg') ||
27+
lowerCaseFileName.endsWith('.gif') ||
28+
lowerCaseFileName.endsWith('.ico') ||
29+
lowerCaseFileName.endsWith('.svg') ||
30+
lowerCaseFileName.endsWith('.woff') ||
31+
lowerCaseFileName.endsWith('.woff2') ||
32+
lowerCaseFileName.endsWith('.eot') ||
33+
lowerCaseFileName.endsWith('.ttf') ||
34+
lowerCaseFileName.endsWith('.otf') ||
35+
lowerCaseFileName.endsWith('.mp4') ||
36+
lowerCaseFileName.endsWith('.avi') ||
37+
lowerCaseFileName.endsWith('.webm') ||
38+
lowerCaseFileName.endsWith('.mov') ||
39+
lowerCaseFileName.endsWith('.mp3') ||
40+
lowerCaseFileName.endsWith('.wav') ||
41+
lowerCaseFileName.endsWith('.flac') ||
42+
lowerCaseFileName.endsWith('.ogg') ||
43+
lowerCaseFileName.endsWith('.webp') ||
44+
lowerCaseFileName.startsWith('package-lock') ||
45+
lowerCaseFileName.startsWith('yarn-lock') ||
46+
lowerCaseFileName.startsWith('npm-debug') ||
47+
lowerCaseFileName.startsWith('yarn-debug') ||
48+
lowerCaseFileName.startsWith('yarn-error') ||
49+
lowerCaseFileName.startsWith('tsconfig') ||
50+
lowerCaseFileName.startsWith('jest.config')
51+
52+
// Add more extensions as needed
53+
);
54+
}
55+
56+
function scrapeDirectoryToJson(dir: string, ignorePatterns: string[] = []): FileData[] {
57+
const files = fs.readdirSync(dir);
58+
return files.filter(file => {
59+
const filePath = path.join(dir, file);
60+
return (
61+
!ignorePatterns.some(pattern => filePath.includes(pattern)) &&
62+
!shouldIgnoreFile(file)
63+
);
64+
}).map(file => {
65+
const filePath = path.join(dir, file);
66+
const stat = fs.statSync(filePath);
67+
68+
if (stat.isDirectory()) {
69+
// Ignore the .git directory
70+
if (file === '.git') {
71+
return null;
72+
}
73+
return {
74+
name: file,
75+
path: filePath,
76+
type: 'directory',
77+
children: scrapeDirectoryToJson(filePath, ignorePatterns)
78+
};
79+
} else {
80+
const content = fs.readFileSync(filePath, 'utf-8');
81+
return {
82+
name: file,
83+
path: filePath,
84+
type: 'file',
85+
content: content
86+
};
87+
}
88+
}).filter(item => item !== null) as FileData[];
89+
}
90+
91+
function scrapeDirectoryToPlainText(dir: string, ignorePatterns: string[] = [], prefix: string = ''): string {
92+
let result = '';
93+
94+
const files = fs.readdirSync(dir);
95+
files.forEach(file => {
96+
const filePath = path.join(dir, file);
97+
98+
if (ignorePatterns.some(pattern => filePath.includes(pattern)) || shouldIgnoreFile(file)) {
99+
return;
100+
}
101+
102+
const stat = fs.statSync(filePath);
103+
104+
if (stat.isDirectory()) {
105+
// Ignore the .git directory
106+
if (file === '.git') {
107+
return;
108+
}
109+
result += scrapeDirectoryToPlainText(filePath, ignorePatterns, path.join(prefix, file));
110+
} else {
111+
const content = fs.readFileSync(filePath, 'utf-8');
112+
result += `${path.join(prefix, file)}\n${content}\n\n`;
113+
}
114+
});
115+
116+
return result;
117+
}
118+
119+
export async function scrapeRepositoryToJson(repoUrl: string): Promise<FileData[]> {
120+
const repoName = repoUrl.split('/').pop()?.replace('.git', '');
121+
if (!repoName) {
122+
throw new Error('Invalid repository URL');
123+
}
124+
const clonePath = `./${repoName}`; // Directory where the repository will be cloned
125+
126+
// Clone the repository
127+
await cloneRepository(repoUrl, clonePath);
128+
129+
// Scrape the cloned repository directory
130+
const ignorePatterns = ['.git'];
131+
const result = scrapeDirectoryToJson(clonePath, ignorePatterns);
132+
133+
// Clean up the cloned repository
134+
fs.rmdirSync(clonePath, { recursive: true });
135+
console.log('Cloned repository directory removed');
136+
137+
return result;
138+
}
139+
140+
export async function scrapeRepositoryToPlainText(repoUrl: string): Promise<string> {
141+
const repoName = repoUrl.split('/').pop()?.replace('.git', '');
142+
if (!repoName) {
143+
throw new Error('Invalid repository URL');
144+
}
145+
const clonePath = `./${repoName}`; // Directory where the repository will be cloned
146+
147+
// Clone the repository
148+
await cloneRepository(repoUrl, clonePath);
149+
150+
// Scrape the cloned repository directory
151+
const ignorePatterns = ['.git'];
152+
const result = scrapeDirectoryToPlainText(clonePath, ignorePatterns);
153+
154+
// Clean up the cloned repository
155+
fs.rmdirSync(clonePath, { recursive: true });
156+
console.log('Cloned repository directory removed');
157+
158+
return result;
159+
}

0 commit comments

Comments
 (0)