We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Title.
Currently, add_media_with_keywords and add_media_to_database exist.
add_media_with_keywords
add_media_to_database
They should be one function, to reduce complexity and chances of shooting myself in the foot later down the line.
First stab:
import hashlib
import logging
import time
from datetime import datetime
from urllib.parse import quote

# Media types accepted by add_media_to_database, in the order they are listed
# in the validation error message.
_VALID_MEDIA_TYPES = (
    'article', 'audio', 'book', 'document', 'mediawiki_article',
    'mediawiki_dump', 'obsidian_note', 'podcast', 'text', 'video', 'unknown',
)


def _content_from_segments(segments):
    """Flatten the supported ``segments`` shapes into a single content string."""
    if segments is None:
        # str(None) would store the literal text "None" as the media content.
        return ''
    if isinstance(segments, list):
        return ' '.join(
            segment['Text'] or ''
            for segment in segments
            if isinstance(segment, dict) and 'Text' in segment
        )
    if isinstance(segments, dict):
        return segments.get('text', '') or segments.get('content', '') or ''
    return str(segments)


def _normalize_keywords(keywords):
    """Return stripped, lower-cased, de-duplicated keywords; never an empty list."""
    if isinstance(keywords, str):
        raw_keywords = keywords.split(',')
    elif isinstance(keywords, (list, tuple)):
        raw_keywords = keywords
    else:
        raw_keywords = ()

    cleaned = (str(k).strip().lower() for k in raw_keywords if k is not None)
    # dict.fromkeys de-duplicates while preserving first-seen order; blank
    # entries (e.g. from "a, ,b" or "") are dropped instead of being stored.
    unique = list(dict.fromkeys(k for k in cleaned if k))
    return unique or ['default']


def _link_keywords(cursor, media_id, keyword_list):
    """Ensure every keyword exists in Keywords and is linked to ``media_id``."""
    cursor.executemany(
        'INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)',
        [(keyword,) for keyword in keyword_list],
    )
    # Only "?" placeholders are interpolated into the SQL text; the keyword
    # values themselves are always bound as parameters.
    placeholders = ','.join(['?'] * len(keyword_list))
    cursor.execute(
        f'SELECT id FROM Keywords WHERE keyword IN ({placeholders})',
        keyword_list,
    )
    keyword_ids = [row[0] for row in cursor.fetchall()]
    cursor.executemany(
        'INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)',
        [(media_id, keyword_id) for keyword_id in keyword_ids],
    )


def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input,
                          whisper_model, media_type='video', overwrite=False, db=None):
    """Insert or update a media item together with its keywords, FTS row and version.

    Args:
        url: Source URL. When falsy, a placeholder URL is built from the media
            type and the content hash.
        info_dict: Mapping read for ``'title'`` (default ``'Untitled'``) and
            ``'uploader'`` (default ``'Unknown'``). ``None`` is treated as empty.
        segments: Content source. A list of dicts contributes each ``'Text'``
            value joined by spaces; a dict contributes ``'text'`` or
            ``'content'``; ``None`` yields empty content; anything else is
            converted with ``str()``.
        summary: Summary stored in MediaModifications and MediaVersion.
        keywords: Comma-separated string, or list/tuple of keywords. Keywords
            are stripped, lower-cased and de-duplicated; when nothing usable
            remains, ``['default']`` is used.
        custom_prompt_input: Prompt stored alongside the summary.
        whisper_model: Value written to ``Media.transcription_model``.
        media_type: One of ``_VALID_MEDIA_TYPES``.
        overwrite: When a row with the same URL or content hash exists, update
            it if true; otherwise leave the database untouched.
        db: Object exposing ``get_connection()``; a new ``Database()`` is
            created when omitted.

    Returns:
        dict with ``media_id``, ``action`` (``"added"``, ``"updated"`` or
        ``"skipped"``), ``url``, ``content_hash`` and ``keywords``. For
        ``"skipped"`` the url/hash are those of the existing row and
        ``keywords`` is empty because nothing was written.

    Raises:
        InputError: ``media_type`` is not an allowed value.
        DatabaseError: Any failure while talking to the database; unexpected
            exceptions are wrapped and chained.
    """
    log_counter("add_media_attempt")
    start_time = time.time()

    # Validate before entering the try block: raised inside it, InputError
    # would be caught by the generic handler and re-wrapped as DatabaseError.
    if media_type not in _VALID_MEDIA_TYPES:
        log_counter("add_media_error", labels={"error_type": "InvalidMediaType"})
        raise InputError(f"Invalid media type. Allowed types: {', '.join(_VALID_MEDIA_TYPES)}")

    if db is None:
        db = Database()

    try:
        info_dict = info_dict or {}
        title = info_dict.get('title', 'Untitled')
        author = info_dict.get('uploader', 'Unknown')

        # Take a single timestamp so the ingestion date and the version
        # timestamp cannot straddle midnight.
        now = datetime.now()
        ingestion_date = now.strftime('%Y-%m-%d')
        created_at = now.strftime('%Y-%m-%d %H:%M:%S')

        content = _content_from_segments(segments)
        # The hash doubles as the duplicate-detection key and as the unique
        # part of the placeholder URL.
        content_hash = hashlib.sha256(content.encode()).hexdigest()
        if not url:
            url = f"https://No-URL-Submitted.com/{media_type}/{content_hash}"

        keyword_list = _normalize_keywords(keywords)

        with db.get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute(
                'SELECT id, url, content_hash FROM Media WHERE url = ? OR content_hash = ?',
                (url, content_hash),
            )
            existing_media = cursor.fetchone()

            if existing_media and not overwrite:
                # Skip means skip: do not touch FTS, versions, keywords or
                # chunking, otherwise they drift from the unchanged Media row.
                media_id = existing_media[0]
                log_counter("add_media_skip")
                logging.info(f"Media ID {media_id} already exists; skipping (overwrite=False)")
                return {
                    "media_id": media_id,
                    "action": "skipped",
                    "url": existing_media[1],
                    "content_hash": existing_media[2],
                    "keywords": [],
                }

            media_values = (url, title, media_type, content, author, ingestion_date,
                            whisper_model, 'pending', content_hash)

            if existing_media:
                media_id = existing_media[0]
                action = "updated"
                log_counter("add_media_update")
                logging.info(f"Updating existing media ID {media_id}")
                cursor.execute('''
                    UPDATE Media
                    SET url = ?, title = ?, type = ?, content = ?, author = ?,
                        ingestion_date = ?, transcription_model = ?,
                        chunking_status = ?, content_hash = ?
                    WHERE id = ?
                ''', media_values + (media_id,))
            else:
                action = "added"
                cursor.execute('''
                    INSERT INTO Media (
                        url, title, type, content, author, ingestion_date,
                        transcription_model, chunking_status, content_hash
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', media_values)
                media_id = cursor.lastrowid
                log_counter("add_media_insert")

            cursor.execute('''
                INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
                VALUES (?, ?, ?, ?)
            ''', (media_id, custom_prompt_input, summary, ingestion_date))

            _link_keywords(cursor, media_id, keyword_list)

            # Keep the full-text index row in step with the Media row.
            cursor.execute('''
                INSERT OR REPLACE INTO media_fts (rowid, title, content)
                VALUES (?, ?, ?)
            ''', (media_id, title, content))

            # Next version number is MAX(version) + 1, starting at 1.
            cursor.execute('''
                INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
                VALUES (?,
                        COALESCE((SELECT MAX(version) FROM MediaVersion WHERE media_id = ?), 0) + 1,
                        ?, ?, ?)
            ''', (media_id, media_id, custom_prompt_input, summary, created_at))

            conn.commit()

            # Chunking is scheduled only after the commit.
            schedule_chunking(media_id, content, title)

            log_histogram("add_media_duration", time.time() - start_time)
            log_counter("add_media_success")

            return {
                "media_id": media_id,
                "action": action,
                "url": url,
                "content_hash": content_hash,
                "keywords": keyword_list,
            }

    except DatabaseError as e:
        logging.error(f"Database error: {str(e)}")
        log_counter("add_media_error", labels={"error_type": "DatabaseError"})
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        log_counter("add_media_error", labels={"error_type": type(e).__name__})
        # Chain the original exception so its traceback is not lost.
        raise DatabaseError(f"Unexpected error: {e}") from e


# Example Usage (guarded so importing this module does not write to the database)
if __name__ == "__main__":
    result = add_media_to_database(
        url="https://example.com/video123",
        info_dict={
            'title': 'My Video',
            'uploader': 'John Doe'
        },
        segments=[{'Text': 'This is a transcript...'}],
        summary="A summary of the content",
        keywords=["technology", "AI"],
        custom_prompt_input="Generate detailed analysis",
        whisper_model="large-v2",
        media_type="video"
    )
    print(f"Media ID {result['media_id']} {result['action']} successfully")
The text was updated successfully, but these errors were encountered:
rmusser01
No branches or pull requests
Title.
Currently,
add_media_with_keywords
and add_media_to_database
exist. They should be one function, to reduce complexity and chances of shooting myself in the foot later down the line.
First stab:
The text was updated successfully, but these errors were encountered: