Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement: Consolidate media ingestion function. #529

Open
Tracked by #556
rmusser01 opened this issue Feb 6, 2025 · 0 comments
Open
Tracked by #556

Enhancement: Consolidate media ingestion function. #529

rmusser01 opened this issue Feb 6, 2025 · 0 comments
Assignees
Labels
bug Something isn't working enhancement New feature or request
Milestone

Comments

@rmusser01
Copy link
Owner

Title.

Currently, both add_media_with_keywords and add_media_to_database exist.

They should be merged into a single function, to reduce complexity and the chance of shooting myself in the foot later down the line.

First stab:

import hashlib
import logging
import time
from datetime import datetime
from urllib.parse import quote

def _segments_to_content(segments):
    """Flatten the ``segments`` argument into a single content string."""
    if segments is None:
        # Avoid persisting the literal string "None" as media content.
        return ''
    if isinstance(segments, list):
        # Ignore entries that are not dicts or carry no usable 'Text' value;
        # str() guards the join against non-string text values.
        return ' '.join(
            str(segment['Text'])
            for segment in segments
            if isinstance(segment, dict) and segment.get('Text') is not None
        )
    if isinstance(segments, dict):
        return str(segments.get('text') or segments.get('content') or '')
    return str(segments)


def _normalize_keywords(keywords):
    """Return stripped, lower-cased, de-duplicated keywords in input order."""
    if isinstance(keywords, str):
        raw_keywords = keywords.split(',')
    elif isinstance(keywords, (list, tuple)):
        raw_keywords = keywords
    else:
        # Anything else (including None) falls back to the default keyword.
        return ['default']

    cleaned = (str(k).strip().lower() for k in raw_keywords if k is not None)
    # dict.fromkeys de-duplicates while keeping first-seen order; empty
    # strings (e.g. from "a,,b" or "") are dropped so they are never stored.
    return list(dict.fromkeys(k for k in cleaned if k))


def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input,
                         whisper_model, media_type='video', overwrite=False, db=None):
    """Insert or update a media record together with its keywords and versions.

    Duplicates are detected by URL or by the SHA-256 hash of the content.
    If a duplicate exists and ``overwrite`` is false, nothing is written and
    the existing record's id is returned with ``action == "skipped"``.

    Args:
        url: Source URL. When falsy, a placeholder URL is derived from the
            media type and content hash.
        info_dict: Metadata mapping; ``'title'`` and ``'uploader'`` are read.
            ``None`` is treated as an empty mapping.
        segments: A list of dicts with a ``'Text'`` key, a dict with a
            ``'text'`` or ``'content'`` key, or any other object (stringified).
        summary: Summary text stored with the modification/version rows.
        keywords: Comma-separated string, or a list/tuple of keywords. Any
            other value results in the single keyword ``'default'``.
        custom_prompt_input: Prompt text stored with the modification/version rows.
        whisper_model: Value written to the ``transcription_model`` column.
        media_type: One of the allowed media type names (see below).
        overwrite: Whether an existing matching record may be updated.
        db: Database wrapper exposing ``get_connection()``; a new
            ``Database()`` is created when omitted.

    Returns:
        dict with ``media_id``, ``action`` (``"added"``, ``"updated"`` or
        ``"skipped"``), ``url``, ``content_hash`` and ``keywords``.

    Raises:
        InputError: If ``media_type`` is not an allowed type.
        DatabaseError: For database failures, and for any other unexpected
            error (chained to the original exception).
    """
    log_counter("add_media_attempt")
    start_time = time.time()

    # Validate before entering the try block so that InputError reaches the
    # caller as-is instead of being re-wrapped (and double-counted) by the
    # generic ``except Exception`` handler below.
    valid_media_types = ['article', 'audio', 'book', 'document', 'mediawiki_article',
                         'mediawiki_dump', 'obsidian_note', 'podcast', 'text', 'video', 'unknown']
    if media_type not in valid_media_types:
        log_counter("add_media_error", labels={"error_type": "InvalidMediaType"})
        raise InputError(f"Invalid media type. Allowed types: {', '.join(valid_media_types)}")

    if db is None:
        db = Database()

    try:
        info_dict = info_dict or {}
        title = info_dict.get('title', 'Untitled')
        author = info_dict.get('uploader', 'Unknown')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        content = _segments_to_content(segments)
        # The content hash is used both for duplicate detection and for the
        # placeholder URL when none was submitted.
        content_hash = hashlib.sha256(content.encode()).hexdigest()
        if not url:
            url = f"https://No-URL-Submitted.com/{media_type}/{content_hash}"

        keyword_list = _normalize_keywords(keywords)

        with db.get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute('''
                SELECT id, url, content_hash FROM Media 
                WHERE url = ? OR content_hash = ?
            ''', (url, content_hash))
            existing_media = cursor.fetchone()

            if existing_media and not overwrite:
                # Duplicate without permission to overwrite: leave every
                # table untouched and just report the existing record.
                media_id = existing_media[0]
                action = "skipped"
                log_counter("add_media_skip")
                logging.info(f"Media ID {media_id} already exists; skipping")
            else:
                if existing_media:
                    media_id = existing_media[0]
                    action = "updated"
                    log_counter("add_media_update")
                    logging.info(f"Updating existing media ID {media_id}")
                    cursor.execute('''
                        UPDATE Media SET
                            url = ?,
                            title = ?,
                            type = ?,
                            content = ?,
                            author = ?,
                            ingestion_date = ?,
                            transcription_model = ?,
                            chunking_status = ?,
                            content_hash = ?
                        WHERE id = ?
                    ''', (url, title, media_type, content, author, ingestion_date,
                          whisper_model, 'pending', content_hash, media_id))
                else:
                    action = "added"
                    cursor.execute('''
                        INSERT INTO Media (
                            url, title, type, content, author, 
                            ingestion_date, transcription_model, 
                            chunking_status, content_hash
                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (url, title, media_type, content, author,
                          ingestion_date, whisper_model, 'pending', content_hash))
                    media_id = cursor.lastrowid
                    log_counter("add_media_insert")

                # Record the prompt/summary that accompanied this write.
                cursor.execute('''
                    INSERT INTO MediaModifications 
                    (media_id, prompt, summary, modification_date)
                    VALUES (?, ?, ?, ?)
                ''', (media_id, custom_prompt_input, summary, ingestion_date))

                if keyword_list:
                    # Create any missing keywords, then link all of them to
                    # the media row; OR IGNORE keeps both steps idempotent.
                    cursor.executemany(
                        'INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)',
                        [(k,) for k in keyword_list],
                    )
                    placeholders = ','.join(['?'] * len(keyword_list))
                    cursor.execute(f'''
                        SELECT id FROM Keywords 
                        WHERE keyword IN ({placeholders})
                    ''', keyword_list)
                    keyword_ids = [row[0] for row in cursor.fetchall()]
                    cursor.executemany('''
                        INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) 
                        VALUES (?, ?)
                    ''', [(media_id, kid) for kid in keyword_ids])

                # Keep the full-text index in sync with the Media row.
                cursor.execute('''
                    INSERT OR REPLACE INTO media_fts (rowid, title, content)
                    VALUES (?, ?, ?)
                ''', (media_id, title, content))

                # Next version number is max(existing) + 1, starting at 1.
                cursor.execute('''
                    INSERT INTO MediaVersion 
                    (media_id, version, prompt, summary, created_at)
                    VALUES (?, COALESCE((SELECT MAX(version) FROM MediaVersion WHERE media_id = ?), 0) + 1, ?, ?, ?)
                ''', (media_id, media_id, custom_prompt_input, summary,
                      datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

                conn.commit()

        # Post-commit: only (re)chunk when content was actually written.
        if action != "skipped":
            schedule_chunking(media_id, content, title)

        duration = time.time() - start_time
        log_histogram("add_media_duration", duration)
        log_counter("add_media_success")

        return {
            "media_id": media_id,
            "action": action,
            "url": url,
            "content_hash": content_hash,
            "keywords": keyword_list
        }

    except DatabaseError as e:
        logging.error(f"Database error: {str(e)}")
        log_counter("add_media_error", labels={"error_type": "DatabaseError"})
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        log_counter("add_media_error", labels={"error_type": type(e).__name__})
        # Chain the cause so the original traceback is preserved.
        raise DatabaseError(f"Unexpected error: {e}") from e



# Example usage: ingest one video transcript and report the outcome.
result = add_media_to_database(
    "https://example.com/video123",                   # url
    {'title': 'My Video', 'uploader': 'John Doe'},    # info_dict
    [{'Text': 'This is a transcript...'}],            # segments
    "A summary of the content",                       # summary
    ["technology", "AI"],                             # keywords
    "Generate detailed analysis",                     # custom_prompt_input
    "large-v2",                                       # whisper_model
    media_type="video",
)

print(f"Media ID {result['media_id']} {result['action']} successfully")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working enhancement New feature or request
Projects
None yet
Development

No branches or pull requests

1 participant