Enhancement: Consolidate media ingestion function. #529

rmusser01 · 2025-02-06T22:42:05Z

Title.

Currently, two overlapping functions exist: `add_media_with_keywords` and `add_media_to_database`.

They should be merged into a single function, to reduce complexity and the chances of shooting myself in the foot further down the line.

First stab:

import hashlib
import logging
import time
from datetime import datetime
from urllib.parse import quote

# Media types accepted by add_media_to_database.
_VALID_MEDIA_TYPES = ['article', 'audio', 'book', 'document', 'mediawiki_article',
                      'mediawiki_dump', 'obsidian_note', 'podcast', 'text', 'video', 'unknown']


def _segments_to_content(segments):
    """Flatten segments (list of dicts, single dict, or anything else) into one string."""
    if isinstance(segments, list):
        return ' '.join(segment.get('Text', '') for segment in segments if 'Text' in segment)
    if isinstance(segments, dict):
        return segments.get('text', '') or segments.get('content', '')
    return str(segments)


def _normalize_keywords(keywords):
    """Return stripped, lower-cased keywords with blanks and duplicates removed."""
    if isinstance(keywords, str):
        raw_keywords = keywords.split(',')
    elif isinstance(keywords, (list, tuple)):
        raw_keywords = keywords
    else:
        # Anything else (including None) falls back to a single default keyword.
        return ['default']
    cleaned = (k.strip().lower() for k in raw_keywords)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(k for k in cleaned if k))


def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input,
                         whisper_model, media_type='video', overwrite=False, db=None):
    """Insert a media item, or update an existing one, together with its metadata.

    An existing item is looked up by ``url`` or by the SHA-256 hash of the
    content. If one is found and ``overwrite`` is false, nothing is written and
    the result reports ``"skipped"``. Otherwise the Media row is inserted or
    updated, and rows are written to MediaModifications, Keywords,
    MediaKeywords, media_fts and MediaVersion in the same transaction, after
    which ``schedule_chunking`` is called.

    Args:
        url: Source URL. When falsy, a placeholder URL is derived from the
            media type and content hash.
        info_dict: Mapping read for ``'title'`` and ``'uploader'``; ``None`` is
            treated as an empty mapping.
        segments: A list of dicts with a ``'Text'`` key, a dict with a
            ``'text'`` or ``'content'`` key, or any other object (converted
            with ``str``).
        summary: Summary text stored with the modification and version rows.
        keywords: Comma-separated string, or list/tuple of strings. Any other
            value yields the single keyword ``'default'``.
        custom_prompt_input: Prompt text stored with the modification and
            version rows.
        whisper_model: Value stored in the ``transcription_model`` column.
        media_type: One of the allowed media types.
        overwrite: Whether an existing item should be updated.
        db: Database object exposing ``get_connection()``; a new ``Database()``
            is created when omitted.

    Returns:
        dict with keys ``media_id``, ``action`` (``"added"``, ``"updated"`` or
        ``"skipped"``), ``url``, ``content_hash`` and ``keywords``.

    Raises:
        InputError: If ``media_type`` is not an allowed type.
        DatabaseError: For any failure while preparing or writing the data.
    """
    # Initialize metrics and logging
    log_counter("add_media_attempt")
    start_time = time.time()

    # Validate before entering the try block so the InputError reaches the
    # caller as-is instead of being rewrapped as a DatabaseError.
    if media_type not in _VALID_MEDIA_TYPES:
        log_counter("add_media_error", labels={"error_type": "InvalidMediaType"})
        raise InputError(f"Invalid media type. Allowed types: {', '.join(_VALID_MEDIA_TYPES)}")

    if db is None:
        db = Database()

    try:
        info_dict = info_dict or {}
        title = info_dict.get('title', 'Untitled')
        author = info_dict.get('uploader', 'Unknown')
        now = datetime.now()
        ingestion_date = now.strftime('%Y-%m-%d')

        content = _segments_to_content(segments)

        # Content hash drives duplicate detection and the placeholder URL.
        content_hash = hashlib.sha256(content.encode('utf-8')).hexdigest()
        if not url:
            url = f"https://No-URL-Submitted.com/{media_type}/{content_hash}"

        keyword_list = _normalize_keywords(keywords)

        with db.get_connection() as conn:
            cursor = conn.cursor()

            # NOTE(review): url and content_hash may match different rows; only
            # the first row returned is considered here.
            cursor.execute('''
                SELECT id, url, content_hash FROM Media 
                WHERE url = ? OR content_hash = ?
            ''', (url, content_hash))
            existing_media = cursor.fetchone()

            if existing_media and not overwrite:
                # Leave every table untouched: rewriting the FTS row or adding
                # a version here would diverge from the unchanged Media row.
                media_id = existing_media[0]
                log_counter("add_media_skip")
                logging.info(f"Media ID {media_id} already exists; skipping (overwrite=False)")
                action = "skipped"
            else:
                if existing_media:
                    media_id = existing_media[0]
                    log_counter("add_media_update")
                    logging.info(f"Updating existing media ID {media_id}")
                    cursor.execute('''
                        UPDATE Media SET
                            url = ?,
                            title = ?,
                            type = ?,
                            content = ?,
                            author = ?,
                            ingestion_date = ?,
                            transcription_model = ?,
                            chunking_status = ?,
                            content_hash = ?
                        WHERE id = ?
                    ''', (url, title, media_type, content, author, ingestion_date,
                          whisper_model, 'pending', content_hash, media_id))
                    action = "updated"
                else:
                    cursor.execute('''
                        INSERT INTO Media (
                            url, title, type, content, author, 
                            ingestion_date, transcription_model, 
                            chunking_status, content_hash
                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (url, title, media_type, content, author,
                          ingestion_date, whisper_model, 'pending', content_hash))
                    media_id = cursor.lastrowid
                    log_counter("add_media_insert")
                    action = "added"

                # Record the prompt/summary used for this modification.
                cursor.execute('''
                    INSERT INTO MediaModifications 
                    (media_id, prompt, summary, modification_date)
                    VALUES (?, ?, ?, ?)
                ''', (media_id, custom_prompt_input, summary, ingestion_date))

                # Batch keyword processing
                if keyword_list:
                    cursor.executemany('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)',
                                       [(k,) for k in keyword_list])

                    # Only the '?' placeholders are interpolated; values stay bound.
                    placeholders = ','.join(['?'] * len(keyword_list))
                    cursor.execute(f'''
                        SELECT id FROM Keywords 
                        WHERE keyword IN ({placeholders})
                    ''', keyword_list)
                    keyword_ids = [row[0] for row in cursor.fetchall()]

                    cursor.executemany('''
                        INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) 
                        VALUES (?, ?)
                    ''', [(media_id, kid) for kid in keyword_ids])

                # Full-text search update
                cursor.execute('''
                    INSERT OR REPLACE INTO media_fts (rowid, title, content)
                    VALUES (?, ?, ?)
                ''', (media_id, title, content))

                # Next version number is one past the current maximum (or 1).
                cursor.execute('''
                    INSERT INTO MediaVersion 
                    (media_id, version, prompt, summary, created_at)
                    VALUES (?, COALESCE((SELECT MAX(version) FROM MediaVersion WHERE media_id = ?), 0) + 1, ?, ?, ?)
                ''', (media_id, media_id, custom_prompt_input, summary,
                      now.strftime('%Y-%m-%d %H:%M:%S')))

                conn.commit()

        # Post-commit operations: only needed when content was actually written.
        if action != "skipped":
            schedule_chunking(media_id, content, title)

        # Metrics and logging
        duration = time.time() - start_time
        log_histogram("add_media_duration", duration)
        log_counter("add_media_success")

        return {
            "media_id": media_id,
            "action": action,
            "url": url,
            "content_hash": content_hash,
            "keywords": keyword_list
        }

    except DatabaseError as e:
        logging.error(f"Database error: {str(e)}")
        log_counter("add_media_error", labels={"error_type": "DatabaseError"})
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        log_counter("add_media_error", labels={"error_type": type(e).__name__})
        # Chain the cause so the original traceback is preserved.
        raise DatabaseError(f"Unexpected error: {e}") from e



# Example usage: ingest one video transcript, then report what happened to it.
example_info = {'title': 'My Video', 'uploader': 'John Doe'}
example_segments = [{'Text': 'This is a transcript...'}]
example_keywords = ["technology", "AI"]

result = add_media_to_database(
    url="https://example.com/video123",
    info_dict=example_info,
    segments=example_segments,
    summary="A summary of the content",
    keywords=example_keywords,
    custom_prompt_input="Generate detailed analysis",
    whisper_model="large-v2",
    media_type="video",
)

# The returned dict carries the row id and the action that was taken.
media_id, action = result['media_id'], result['action']
print(f"Media ID {media_id} {action} successfully")

The text was updated successfully, but these errors were encountered:

rmusser01 added the "bug" (Something isn't working) and "enhancement" (New feature or request) labels on Feb 6, 2025

rmusser01 added this to the Beta v10 milestone Feb 6, 2025

rmusser01 self-assigned this Feb 6, 2025

This was referenced Feb 7, 2025

Update Backup_Config.txt & Fix Cohere Summarization #536

Merged

Max response tokens setting, bugfix for custom openAI API endpoint #539

Merged

Fix for llama.cpp summarization #542

Merged

This was referenced Feb 14, 2025

Update Utils.py #548

Merged

Fix ollama + loosen validity check for printable characters in summary results #549

Merged

Update Utils.py #553

Merged

Links + More Progress #556

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Enhancement: Consolidate media ingestion function. #529

Enhancement: Consolidate media ingestion function. #529

rmusser01 commented Feb 6, 2025

Enhancement: Consolidate media ingestion function. #529

Enhancement: Consolidate media ingestion function. #529

Comments

rmusser01 commented Feb 6, 2025