-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_content.py
96 lines (77 loc) · 3.27 KB
/
update_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
Script to update existing pages with titles and summaries.
"""
import argparse
import asyncio
import os
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from tqdm import tqdm

from content_enhancer import ContentEnhancer
from db_client import SupabaseClient
# Load environment variables
load_dotenv()
def _fetch_sites(db_client, site_id: Optional[int] = None) -> List[Any]:
    """Return the ``(id, name, url)`` rows of ``crawl_sites`` to process.

    Args:
        db_client: Client object exposing ``_get_connection()``.
        site_id: When given, only the row with this ID is selected;
            otherwise every row is returned ordered by ID.

    Returns:
        The rows produced by ``cursor.fetchall()`` (possibly empty).
    """
    conn = db_client._get_connection()
    try:
        cur = conn.cursor()
        # Compare against None rather than truthiness so that an explicit
        # ID of 0 filters for that site instead of selecting every site.
        if site_id is not None:
            cur.execute(
                "SELECT id, name, url FROM crawl_sites WHERE id = %s",
                (site_id,),
            )
        else:
            cur.execute("SELECT id, name, url FROM crawl_sites ORDER BY id")
        return cur.fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()


def _needs_update(page: Dict[str, Any]) -> bool:
    """Return True when the page lacks a non-empty title or summary."""
    return not page.get('title') or not page.get('summary')


async def update_pages(
    site_id: Optional[int] = None,
    limit: int = 100,
    force: bool = False,
) -> None:
    """Update existing pages with titles and summaries.

    Progress is reported on stdout. Nothing is returned.

    Args:
        site_id: Optional site ID to filter by. If not provided (``None``),
            all sites are processed in ID order.
        limit: Maximum number of pages fetched per site. The
            "needs updating" filter is applied after fetching, so fewer
            than ``limit`` pages may actually be updated.
        force: Whether to also update pages that already have both a
            title and a summary.
    """
    db_client = SupabaseClient()
    content_enhancer = ContentEnhancer()

    sites = _fetch_sites(db_client, site_id)
    if not sites:
        print("No sites found.")
        return

    print(f"Found {len(sites)} site(s) to update.")

    # Use a distinct loop variable so the ``site_id`` argument is not rebound.
    for current_site_id, site_name, _site_url in sites:
        print(f"\nUpdating site: {site_name} (ID: {current_site_id})")

        pages = db_client.get_pages_by_site_id(current_site_id, limit)
        if not pages:
            print("No pages found for this site.")
            continue

        if force:
            pages_to_update = pages
            print(f"Forcing update of {len(pages_to_update)} page(s).")
        else:
            pages_to_update = [page for page in pages if _needs_update(page)]
            if not pages_to_update:
                print("All pages already have titles and summaries.")
                continue
            print(f"Found {len(pages_to_update)} page(s) that need updating.")

        print("Generating titles and summaries...")
        enhanced_pages = await content_enhancer.enhance_pages_async(pages_to_update)

        # NOTE(review): add_pages is presumably an upsert for existing pages;
        # confirm against SupabaseClient before relying on it.
        print("Updating database...")
        page_ids = db_client.add_pages(current_site_id, enhanced_pages)
        print(f"Successfully updated {len(page_ids)} page(s).")
def main():
    """Parse the command-line flags and launch the page update."""
    cli = argparse.ArgumentParser(
        description="Update existing pages with titles and summaries"
    )

    # Each entry pairs a flag with the keyword arguments for add_argument.
    option_specs = (
        ("--site-id", {"type": int, "help": "Site ID to update (optional)"}),
        (
            "--limit",
            {
                "type": int,
                "default": 100,
                "help": "Maximum number of pages to update per site",
            },
        ),
        (
            "--force",
            {
                "action": "store_true",
                "help": "Force update pages that already have titles and summaries",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()

    # Build the coroutine first, then run it to completion.
    job = update_pages(options.site_id, options.limit, options.force)
    asyncio.run(job)
# Run the command-line entry point only when executed as a script,
# so importing this module has no side effect beyond load_dotenv().
if __name__ == "__main__":
    main()