· 6 months ago · Mar 13, 2025, 04:50 PM
1# ═════════════════════════════════════════════════════════════════════════════
2# ═══════════════════════════ ENHANCEMENT OVERVIEW ═══════════════════════════
3# ═════════════════════════════════════════════════════════════════════════════
4"""
5SkypeExporter Enhancements:
6
71. Basic Mode:
8 - Added simplified procedural workflow
9 - Streamlined user interaction
10 - Direct prompts with clear instructions
11
122. Enhanced Filename Sanitization:
13 - Reserved Windows name handling
14 - Cross-platform compatibility
15 - Length limits enforcement
16 - Special character handling
17
183. Memory Profiling & Optimization:
19 - Memory usage tracking
20 - Dynamic batch size adjustment
21 - Automated garbage collection
22 - System-aware resource allocation
23
244. PostgreSQL Export:
25 - Normalized database schema
26 - SQLAlchemy ORM integration
27 - Batch insertion optimization
28 - Connection pooling and management
29 - Configurable database settings
30"""
31
32# ═════════════════════════════════════════════════════════════════════════════
33# ═══════════════════════════ IMPORTS AND SETUP ═══════════════════════════════
34# ═════════════════════════════════════════════════════════════════════════════
35
36import argparse
37import asyncio
38import concurrent.futures
39import dataclasses
40import datetime
41import fnmatch
42import gc
43import html
44import importlib.metadata
45import json
46import logging
47import os
48import platform
49import psutil
50import re
51import shutil
52import signal
53import sys
54import tarfile
55import tempfile
56import time
57import traceback
58import uuid
59import zipfile
60from abc import ABC, abstractmethod
61from contextlib import contextmanager
62from dataclasses import dataclass, field
63from enum import Enum, auto
64from pathlib import Path
65from typing import (Any, Dict, Generator, List, Optional, Set, Tuple)
66
67# Import for SQLAlchemy
68try:
69 from sqlalchemy import (
70 Column, ForeignKey, Integer, String, DateTime, Boolean, Text, create_engine,
71 select, func, Index, UniqueConstraint
72 )
73 from sqlalchemy.orm import relationship, Session, sessionmaker, declarative_base
74 from sqlalchemy.ext.declarative import declared_attr
75 SQLALCHEMY_AVAILABLE = True
76except ImportError:
77 SQLALCHEMY_AVAILABLE = False
78
79# Import for Rich and other optional libraries
80try:
81 from rich.console import Console
82 from rich.progress import Progress, TextColumn, BarColumn, TimeElapsedColumn, TimeRemainingColumn
83 from rich.table import Table
84 from rich.panel import Panel
85 from rich.markdown import Markdown
86 RICH_AVAILABLE = True
87except ImportError:
88 RICH_AVAILABLE = False
89
90try:
91 from tqdm import tqdm
92 TQDM_AVAILABLE = True
93except ImportError:
94 TQDM_AVAILABLE = False
95
96# ═════════════════════════════════════════════════════════════════════════════
97# ═══════════════════════════ CUSTOM EXCEPTIONS ══════════════════════════════
98# ═════════════════════════════════════════════════════════════════════════════
99
100class SkypeExporterError(Exception):
101 """Base exception for all Skype Exporter errors."""
102 pass
103
104class ConfigError(SkypeExporterError):
105 """Error in configuration settings."""
106 pass
107
108class FileReadError(SkypeExporterError):
109 """Error reading input files."""
110 pass
111
112class FileWriteError(SkypeExporterError):
113 """Error writing output files."""
114 pass
115
116class ParseError(SkypeExporterError):
117 """Error parsing Skype data."""
118 pass
119
120class TimestampError(ParseError):
121 """Error parsing timestamps."""
122 pass
123
124class ExportError(SkypeExporterError):
125 """Error exporting conversations."""
126 pass
127
128class DatabaseError(SkypeExporterError):
129 """Error with database operations."""
130 pass
131
132class MemoryError(SkypeExporterError):
133 """Error with memory management."""
134 pass
135
136# ═════════════════════════════════════════════════════════════════════════════
137# ═══════════════════════════ DEPENDENCY MANAGEMENT ═══════════════════════════
138# ═════════════════════════════════════════════════════════════════════════════
139
140REQUIRED_PACKAGES = {
141 "beautifulsoup4": "4.9.0",
142 "lxml": "4.5.0",
143 "colorama": "0.4.3",
144 "tqdm": "4.45.0",
145 "rich": "10.0.0",
146 "jinja2": "3.0.0",
147 "markdown": "3.3.0",
148 "pyyaml": "6.0.0",
149 "psutil": "5.8.0", # Added for memory monitoring
150 "sqlalchemy": "1.4.0", # Added for PostgreSQL export
151 "psycopg2-binary": "2.9.0", # Added for PostgreSQL connection
152 "alembic": "1.7.0", # Added for database migrations
153}
154
155def check_dependencies() -> Dict[str, bool]:
156 """
157 Check if required dependencies are installed and at the correct version.
158
159 Returns:
160 Dict[str, bool]: Dictionary of package names and whether they're properly installed
161 """
162 result = {}
163
164 for package, min_version in REQUIRED_PACKAGES.items():
165 try:
166 installed_version = importlib.metadata.version(package)
167 version_ok = _compare_versions(installed_version, min_version) >= 0
168 result[package] = version_ok
169 except importlib.metadata.PackageNotFoundError:
170 result[package] = False
171
172 return result
173
174def _compare_versions(version1: str, version2: str) -> int:
175 """
176 Compare two version strings.
177
178 Args:
179 version1: First version string
180 version2: Second version string
181
182 Returns:
183 int: 1 if version1 > version2, 0 if equal, -1 if version1 < version2
184 """
185 def normalize(v):
186 return [int(x) for x in re.sub(r'(\.0+)*$', '', v).split(".")]
187
188 v1 = normalize(version1)
189 v2 = normalize(version2)
190
191 for i in range(max(len(v1), len(v2))):
192 n1 = v1[i] if i < len(v1) else 0
193 n2 = v2[i] if i < len(v2) else 0
194 if n1 > n2:
195 return 1
196 elif n1 < n2:
197 return -1
198
199 return 0
200
201def install_dependencies() -> None:
202 """
203 Check for missing dependencies and provide installation instructions.
204
205 Instead of automatically installing packages, this now warns the user
206 and provides instructions for manual installation.
207 """
208 dependencies = check_dependencies()
209 missing = []
210
211 for dep, installed in dependencies.items():
212 if not installed:
213 missing.append(dep)
214
215 if missing:
216 print("\nWARNING: The following dependencies are missing:")
217 for dep in missing:
218 print(f" - {dep}")
219
220 print("\nPlease install them manually with:")
221 print(f" pip install {' '.join(missing)}")
222 print("\nContinuing with limited functionality. Some features may not work correctly.")
223 else:
224 print("All dependencies are installed.")
225
226# Import optional dependencies, which may fail
227try:
228 from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
229 import warnings
230 warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
231 BEAUTIFULSOUP_AVAILABLE = True
232except ImportError:
233 BEAUTIFULSOUP_AVAILABLE = False
234
235try:
236 from rich.console import Console
237 from rich.progress import Progress, TextColumn, BarColumn, TimeElapsedColumn, TimeRemainingColumn
238 from rich.panel import Panel
239 from rich.table import Table
240 from rich.syntax import Syntax
241 from rich.logging import RichHandler
242 from rich.traceback import install as install_rich_traceback
243 from rich.prompt import Prompt, Confirm
244 RICH_AVAILABLE = True
245 install_rich_traceback()
246except ImportError:
247 RICH_AVAILABLE = False
248
249try:
250 from colorama import init as colorama_init
251 from colorama import Fore, Back, Style
252 COLORAMA_AVAILABLE = True
253 colorama_init()
254except ImportError:
255 COLORAMA_AVAILABLE = False
256
257try:
258 from tqdm import tqdm
259 TQDM_AVAILABLE = True
260except ImportError:
261 TQDM_AVAILABLE = False
262
263try:
264 import markdown
265 MARKDOWN_AVAILABLE = True
266except ImportError:
267 MARKDOWN_AVAILABLE = False
268
269try:
270 import jinja2
271 JINJA2_AVAILABLE = True
272except ImportError:
273 JINJA2_AVAILABLE = False
274
275try:
276 import yaml
277 YAML_AVAILABLE = True
278except ImportError:
279 YAML_AVAILABLE = False
280
281try:
282 import sqlalchemy
283 from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, Boolean, ForeignKey
284 from sqlalchemy.orm import sessionmaker, relationship, Session, declarative_base
285 SQLALCHEMY_AVAILABLE = True
286except ImportError:
287 SQLALCHEMY_AVAILABLE = False
288
289try:
290 import psycopg2
291 PSYCOPG2_AVAILABLE = True
292except ImportError:
293 PSYCOPG2_AVAILABLE = False
294
295# ═════════════════════════════════════════════════════════════════════════════
296# ═══════════════════════════ MEMORY MANAGEMENT ═══════════════════════════════
297# ═════════════════════════════════════════════════════════════════════════════
298
299class MemoryMonitor:
300 """
301 Monitor and manage memory usage during processing.
302
303 This class provides utilities to track memory usage, optimize batch sizes,
304 and trigger garbage collection based on system resources.
305 """
306
307 def __init__(self, ctx: 'AppContext'):
308 """Initialize memory monitor with application context."""
309 self.ctx = ctx
310 self.logger = ctx.logger.getChild('memory')
311 self.process = psutil.Process(os.getpid()) # Initialize the process object
312 self.usage_history = []
313 self.memory_samples = [] # Restore memory samples list
314 self.memory_timestamps = [] # Restore timestamps list
315 self.peak_usage = 0
316 self.last_memory_percent = None
317 self.last_gc_time = time.time()
318 self.memory_target = ctx.options.memory_threshold_percent
319 self.check_counter = 0 # Counter for adaptive memory checks
320
321 # Capture initial memory usage
322 self.record_memory_usage()
323
324 self.logger.debug(f"Memory monitor initialized with target: {self.memory_target}%")
325
326 def get_memory_usage_mb(self) -> float:
327 """
328 Get current memory usage in megabytes.
329
330 Returns:
331 Memory usage in MB
332 """
333 return self.process.memory_info().rss / (1024 * 1024)
334
335 def get_memory_percent(self) -> float:
336 """
337 Get memory usage as percentage of system memory.
338
339 Returns:
340 Memory usage percentage
341 """
342 return self.process.memory_percent()
343
344 def get_system_memory_mb(self) -> float:
345 """
346 Get total system memory in megabytes.
347
348 Returns:
349 Total system memory in MB
350 """
351 return psutil.virtual_memory().total / (1024 * 1024)
352
353 def record_memory_usage(self) -> None:
354 """Record current memory usage for tracking."""
355 current_usage_mb = self.get_memory_usage_mb()
356 current_time = time.time()
357
358 # Record in both tracking mechanisms for backward compatibility
359 self.usage_history.append(current_usage_mb)
360 self.memory_samples.append(current_usage_mb)
361 self.memory_timestamps.append(current_time)
362
363 # Keep only the last 100 samples in both arrays
364 if len(self.memory_samples) > 100:
365 self.memory_samples.pop(0)
366 self.memory_timestamps.pop(0)
367
368 if len(self.usage_history) > 100:
369 self.usage_history.pop(0)
370
371 def check_memory(self) -> bool:
372 """
373 Check memory usage and optimize if needed.
374
375 Returns:
376 True if optimization was performed, False otherwise
377 """
378 # Use an adaptive check interval based on previous memory usage
379 self.check_counter += 1
380
381 # Default intervals for memory checks (operations between checks)
382 low_usage_interval = 100 # Less frequent checks when memory usage is low
383 medium_usage_interval = 25 # Medium frequency checks
384 high_usage_interval = 5 # Frequent checks when memory is high
385
386 # Determine the check interval based on last measured memory percentage
387 if self.last_memory_percent is None:
388 check_interval = medium_usage_interval
389 elif self.last_memory_percent < 30:
390 check_interval = low_usage_interval
391 elif self.last_memory_percent < 60:
392 check_interval = medium_usage_interval
393 else:
394 check_interval = high_usage_interval
395
396 # Skip check if we haven't reached the interval, unless it's the first check
397 if self.check_counter % check_interval != 0 and self.last_memory_percent is not None:
398 return False
399
400 # Get current memory usage
401 memory_percent = self.get_memory_percent()
402 memory_usage_mb = self.get_memory_usage_mb()
403 self.last_memory_percent = memory_percent
404
405 # Record usage for historical tracking
406 self.record_memory_usage()
407
408 if memory_usage_mb > self.peak_usage:
409 self.peak_usage = memory_usage_mb
410
411 # Check if memory usage exceeds threshold
412 if memory_percent > self.memory_target:
413 self.logger.warning(
414 f"Memory usage high: {memory_percent:.1f}% ({memory_usage_mb:.1f} MB), "
415 f"optimizing..."
416 )
417 self._optimize_memory()
418 return True
419
420 # Occasionally collect garbage even if memory usage is low
421 # but at a lower frequency (every 5000 operations or 60 seconds)
422 elif (self.check_counter % 5000 == 0 or
423 (time.time() - self.last_gc_time > 60)):
424 self.logger.debug(
425 f"Performing routine garbage collection: {memory_percent:.1f}% "
426 f"({memory_usage_mb:.1f} MB)"
427 )
428 self._collect_garbage()
429
430 self.logger.debug(
431 f"Memory usage: {memory_percent:.1f}% ({memory_usage_mb:.1f} MB) "
432 f"of {self.get_system_memory_mb():.1f} MB"
433 )
434
435 return False
436
437 def _optimize_memory(self) -> None:
438 """Optimize memory usage by adjusting batch sizes and collecting garbage."""
439 self.logger.info("Optimizing memory usage...")
440
441 # Reduce batch size to conserve memory
442 current_batch_size = self.ctx.options.batch_size
443 new_batch_size = max(100, current_batch_size // 2)
444
445 if new_batch_size < current_batch_size:
446 self.logger.info(f"Reducing batch size from {current_batch_size} to {new_batch_size}")
447 self.ctx.options.batch_size = new_batch_size
448
449 # Reduce max workers if memory usage is very high
450 if self.get_memory_percent() > 90 and self.ctx.options.max_workers > 2:
451 self.logger.warning("Critical memory usage - reducing worker threads")
452 self.ctx.options.max_workers = max(1, self.ctx.options.max_workers // 2)
453
454 # Force garbage collection
455 self._collect_garbage()
456
457 def _collect_garbage(self) -> None:
458 """Force garbage collection to free memory."""
459 self.logger.debug("Running garbage collection...")
460
461 before_mb = self.get_memory_usage_mb()
462 gc.collect()
463 after_mb = self.get_memory_usage_mb()
464
465 freed_mb = before_mb - after_mb
466 self.logger.debug(f"Garbage collection freed {freed_mb:.2f} MB")
467
468 def calculate_optimal_batch_size(self, item_count: int) -> int:
469 """
470 Calculate optimal batch size based on available system resources.
471
472 Args:
473 item_count: Total number of items to process
474
475 Returns:
476 Optimal batch size
477 """
478 # Get available memory in MB
479 available_memory = psutil.virtual_memory().available / (1024 * 1024)
480
481 # Estimate memory per item (using exponential moving average if we have samples)
482 current_memory = self.get_memory_usage_mb()
483 memory_per_item = 0.1 # Default assumption: 100KB per item
484
485 # Calculate optimal batch size - aim to use at most 20% of available memory
486 max_memory_to_use = available_memory * 0.2
487 optimal_batch_size = int(max_memory_to_use / memory_per_item)
488
489 # Constrain within reasonable limits
490 optimal_batch_size = min(optimal_batch_size, 5000) # Never go above 5000
491 optimal_batch_size = max(optimal_batch_size, 100) # Never go below 100
492
493 # Round to nearest 100 for cleaner numbers
494 optimal_batch_size = round(optimal_batch_size / 100) * 100
495
496 self.logger.debug(f"Calculated optimal batch size: {optimal_batch_size} "
497 f"(available memory: {available_memory:.2f} MB)")
498
499 return optimal_batch_size
500
501 def get_memory_report(self) -> Dict[str, Any]:
502 """
503 Generate a report on memory usage.
504
505 Returns:
506 Dictionary with memory statistics
507 """
508 return {
509 "current_usage_mb": self.get_memory_usage_mb(),
510 "current_usage_percent": self.get_memory_percent(),
511 "peak_usage_mb": max(self.memory_samples) if self.memory_samples else self.get_memory_usage_mb(),
512 "system_memory_mb": self.get_system_memory_mb(),
513 "batch_size": self.ctx.options.batch_size,
514 "max_workers": self.ctx.options.max_workers
515 }
516
517# ═════════════════════════════════════════════════════════════════════════════
518# ═══════════════════════════ FILEPATH UTILITIES ═══════════════════════════════
519# ═════════════════════════════════════════════════════════════════════════════
520
521def sanitize_filename(name: str, max_length: int = 200) -> str:
522 """
523 Sanitize a string to be used as a filename across all platforms.
524
525 Handles invalid characters, reserved Windows names, and length limitations.
526
527 Args:
528 name: Original name to sanitize
529 max_length: Maximum length for the filename
530
531 Returns:
532 Sanitized filename string safe for all platforms
533 """
534 if not name:
535 return "unnamed"
536
537 # Handle file system restrictions
538 # 1. Replace invalid characters
539 sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', name)
540
541 # 2. Check for reserved Windows names (CON, PRN, AUX, etc.)
542 reserved_names = {
543 'CON', 'PRN', 'AUX', 'NUL',
544 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
545 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9'
546 }
547
548 # Check if name matches a reserved name (either exactly or with an extension)
549 name_parts = sanitized.split('.')
550 if name_parts[0].upper() in reserved_names:
551 sanitized = f"_{sanitized}"
552
553 # 3. Enforce length limit with smart truncation
554 if len(sanitized) > max_length:
555 # Keep the extension if present
556 if '.' in sanitized:
557 extension = '.' + sanitized.split('.')[-1]
558 base_name = '.'.join(sanitized.split('.')[:-1])
559
560 # Truncate the base name, leaving room for ellipsis and extension
561 available_length = max_length - len(extension) - 3 # 3 for "..."
562 sanitized = base_name[:available_length] + "..." + extension
563 else:
564 sanitized = sanitized[:max_length-3] + "..."
565
566 # 4. Ensure name doesn't end with space or period (Windows restriction)
567 sanitized = sanitized.rstrip(' .')
568
569 # 5. If empty after sanitization, provide a fallback
570 if not sanitized:
571 sanitized = "unnamed_file"
572
573 return sanitized
574
575def ensure_directory(path: Path) -> Path:
576 """
577 Ensure a directory exists, creating it if necessary.
578
579 Args:
580 path: Directory path to ensure
581
582 Returns:
583 Path to the directory
584 """
585 path.mkdir(parents=True, exist_ok=True)
586 return path
587
588def get_unique_filename(directory: Path, base_name: str, extension: str) -> Path:
589 """
590 Generate a unique filename by appending a counter if needed.
591
592 Args:
593 directory: Directory path
594 base_name: Base filename
595 extension: File extension
596
597 Returns:
598 Path to a unique filename
599 """
600 # Ensure extension starts with a dot
601 if extension and not extension.startswith('.'):
602 extension = '.' + extension
603
604 # First try the original name
605 file_path = directory / f"{base_name}{extension}"
606 if not file_path.exists():
607 return file_path
608
609 # Add counter until we find an unused name
610 counter = 1
611 while True:
612 file_path = directory / f"{base_name}_{counter}{extension}"
613 if not file_path.exists():
614 return file_path
615 counter += 1
616
617# ═════════════════════════════════════════════════════════════════════════════
618# ═══════════════════════════ CONFIGURATION AND SETUP ═════════════════════════
619# ═════════════════════════════════════════════════════════════════════════════
620
621class LogLevel(Enum):
622 """Log levels with descriptive names."""
623 DEBUG = logging.DEBUG
624 INFO = logging.INFO
625 WARNING = logging.WARNING
626 ERROR = logging.ERROR
627 CRITICAL = logging.CRITICAL
628
629class OutputFormat(Enum):
630 """Supported output formats for exporting conversations."""
631 TEXT = auto()
632 HTML = auto()
633 MARKDOWN = auto()
634 JSON = auto()
635 POSTGRESQL = auto() # Added support for PostgreSQL export
636 ALL = auto()
637
638@dataclass
639class DatabaseConfig:
640 """Configuration for database connections."""
641 engine: str = "postgresql"
642 host: str = "localhost"
643 port: int = 5432
644 database: str = "skype_export"
645 username: str = "postgres"
646 password: str = ""
647 schema: str = "public"
648 connection_pool_size: int = 5
649 connection_max_overflow: int = 10
650 connection_timeout: int = 30
651 echo_sql: bool = False
652
653 @property
654 def connection_string(self) -> str:
655 """Generate SQLAlchemy connection string."""
656 return (f"{self.engine}://{self.username}:{self.password}@"
657 f"{self.host}:{self.port}/{self.database}")
658
659@dataclass
660class ExportOptions:
661 """Configuration options for the export process."""
662 output_dir: Path = Path.cwd() / "skype_exports"
663 format: OutputFormat = OutputFormat.TEXT
664 anonymize: bool = False
665 include_timestamps: bool = True
666 use_local_time: bool = True
667 include_metadata: bool = True
668 include_message_ids: bool = False
669 parallel: bool = True
670 max_workers: int = max(1, os.cpu_count() or 4)
671 batch_size: int = 1000
672 timezone: Optional[str] = None
673 pretty_print: bool = True
674 compress_output: bool = False
675 filter_pattern: Optional[str] = None
676 date_range: Optional[Tuple[datetime.date, datetime.date]] = None
677 include_conversation_stats: bool = True
678 media_links: bool = False
679 strip_html: bool = True
680 debug_mode: bool = False
681 basic_mode: bool = False # Added for basic mode
682 enable_memory_optimization: bool = True # Added for memory optimization
683 memory_profile: bool = False # Added for memory profiling
684 memory_threshold_percent: int = 75 # Added for memory monitoring
685 database_config: DatabaseConfig = field(default_factory=DatabaseConfig)
686
687@dataclass
688class AppContext:
689 """Application context with shared resources and state."""
690 options: ExportOptions = field(default_factory=ExportOptions)
691 logger: logging.Logger = field(default_factory=lambda: logging.getLogger("original_scripts.testing"))
692 console: Any = field(default=None)
693 temp_dir: Optional[Path] = None
694 start_time: float = field(default_factory=time.time)
695 user_id: Optional[str] = None
696 user_display_name: Optional[str] = None
697 export_date: Optional[str] = None
698 export_time: Optional[str] = None
699 total_conversations: int = 0
700 total_messages: int = 0
701 processed_conversations: int = 0
702 processed_messages: int = 0
703 errors: List[Dict[str, Any]] = field(default_factory=list)
704 cancel_requested: bool = False
705 memory_monitor: Optional['MemoryMonitor'] = None
706
707 def __post_init__(self):
708 """Initialize console based on available libraries."""
709 if RICH_AVAILABLE and not self.console:
710 self.console = Console()
711
712 if self.options.enable_memory_optimization:
713 try:
714 self.memory_monitor = MemoryMonitor(self)
715 except Exception as e:
716 self.logger.warning(f"Failed to initialize memory monitor: {e}")
717
718 @property
719 def progress_tracker(self):
720 """Get a progress tracker based on available libraries."""
721 if RICH_AVAILABLE:
722 return Progress(
723 TextColumn("[bold blue]{task.description}"),
724 BarColumn(),
725 "[progress.percentage]{task.percentage:>3.0f}%",
726 TimeElapsedColumn(),
727 TimeRemainingColumn(),
728 console=self.console
729 )
730 elif TQDM_AVAILABLE:
731 return tqdm
732 else:
733 return None # Simple text-based progress will be used
734
735 @contextmanager
736 def create_temp_directory(self) -> Generator[Path, None, None]:
737 """Create and manage a temporary directory for processing."""
738 try:
739 temp_dir = Path(tempfile.mkdtemp(prefix="original_scripts.testing_"))
740 self.temp_dir = temp_dir
741 yield temp_dir
742 finally:
743 if self.temp_dir and self.temp_dir.exists():
744 shutil.rmtree(self.temp_dir, ignore_errors=True)
745 self.temp_dir = None
746
747 def check_memory(self) -> bool:
748 """
749 Check memory usage and optimize if needed.
750
751 Returns:
752 True if optimization was performed, False otherwise
753 """
754 if self.memory_monitor and self.options.enable_memory_optimization:
755 return self.memory_monitor.check_memory()
756 return False
757
758 def get_memory_report(self) -> Optional[Dict[str, Any]]:
759 """
760 Get a report on memory usage.
761
762 Returns:
763 Dictionary with memory statistics or None if monitoring disabled
764 """
765 if self.memory_monitor:
766 return self.memory_monitor.get_memory_report()
767 return None
768
769def setup_logging(level: LogLevel = LogLevel.INFO, log_file: Optional[Path] = None) -> logging.Logger:
770 """
771 Configure logging with rich formatting if available.
772
773 Args:
774 level: Logging level to use
775 log_file: Optional file path to write logs to
776
777 Returns:
778 Configured logger instance
779 """
780 log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
781
782 # Create logger
783 logger = logging.getLogger("original_scripts.testing")
784 logger.setLevel(level.value)
785 logger.handlers = [] # Clear any existing handlers
786
787 # Console handler
788 if RICH_AVAILABLE:
789 console_handler = RichHandler(rich_tracebacks=True)
790 console_handler.setFormatter(logging.Formatter("%(message)s"))
791 else:
792 console_handler = logging.StreamHandler()
793 console_handler.setFormatter(logging.Formatter(log_format))
794
795 console_handler.setLevel(level.value)
796 logger.addHandler(console_handler)
797
798 # File handler if specified
799 if log_file:
800 log_file.parent.mkdir(parents=True, exist_ok=True)
801 file_handler = logging.FileHandler(log_file, encoding='utf-8')
802 file_handler.setFormatter(logging.Formatter(log_format))
803 file_handler.setLevel(level.value)
804 logger.addHandler(file_handler)
805
806 return logger
807
808def get_logger(name: str, ctx: AppContext) -> logging.Logger:
809 """
810 Get a consistently configured logger instance.
811
812 Args:
813 name: Logger name (will be prefixed with original_scripts.testing)
814 ctx: Application context with configuration
815
816 Returns:
817 Configured logger instance
818 """
819 logger = ctx.logger.getChild(name)
820 return logger
821
822# ═════════════════════════════════════════════════════════════════════════════
823# ═══════════════════════════ DOMAIN MODELS ════════════════════════════════════
824# ═════════════════════════════════════════════════════════════════════════════
825
826@dataclass
827class SkypeMessage:
828 """Represents a single message in a Skype conversation."""
829 id: str
830 timestamp: datetime.datetime
831 sender_id: str
832 sender_display_name: str
833 content: str
834 message_type: str
835 edited: bool = False
836 original_json: Dict[str, Any] = field(default_factory=dict)
837
838 @property
839 def formatted_timestamp(self) -> str:
840 """Format the timestamp for display."""
841 return self.timestamp.strftime("%Y-%m-%d %H:%M:%S")
842
843 @property
844 def date(self) -> datetime.date:
845 """Get the date of the message."""
846 return self.timestamp.date()
847
848 @property
849 def time(self) -> datetime.time:
850 """Get the time of the message."""
851 return self.timestamp.time()
852
853@dataclass
854class SkypeConversation:
855 """Represents a Skype conversation with metadata and messages."""
856 id: str
857 display_name: str
858 messages: List[SkypeMessage] = field(default_factory=list)
859 first_timestamp: Optional[datetime.datetime] = None
860 last_timestamp: Optional[datetime.datetime] = None
861 participants: Dict[str, str] = field(default_factory=dict)
862 original_json: Dict[str, Any] = field(default_factory=dict)
863
864 def __post_init__(self):
865 """Calculate first and last timestamps after initialization."""
866 if self.messages:
867 message_timestamps = [m.timestamp for m in self.messages]
868 self.first_timestamp = min(message_timestamps)
869 self.last_timestamp = max(message_timestamps)
870
871 @property
872 def message_count(self) -> int:
873 """Get the total number of messages in the conversation."""
874 return len(self.messages)
875
876 @property
877 def duration(self) -> Optional[datetime.timedelta]:
878 """Get the duration of the conversation."""
879 if self.first_timestamp and self.last_timestamp:
880 return self.last_timestamp - self.first_timestamp
881 return None
882
883 @property
884 def days_active(self) -> Optional[int]:
885 """Get the number of days the conversation was active."""
886 if self.duration:
887 return self.duration.days
888 return None
889
890 def get_messages_by_date(self, date: datetime.date) -> List[SkypeMessage]:
891 """Get all messages from a specific date."""
892 return [msg for msg in self.messages if msg.date == date]
893
894 def get_message_dates(self) -> Set[datetime.date]:
895 """Get all unique dates when messages were sent."""
896 return {msg.date for msg in self.messages}
897
898 def add_message(self, message: SkypeMessage) -> None:
899 """Add a message to the conversation and update timestamps."""
900 self.messages.append(message)
901
902 # Update first/last timestamps
903 if not self.first_timestamp or message.timestamp < self.first_timestamp:
904 self.first_timestamp = message.timestamp
905
906 if not self.last_timestamp or message.timestamp > self.last_timestamp:
907 self.last_timestamp = message.timestamp
908
909@dataclass
910class SkypeExport:
911 """Represents a complete Skype export with metadata and conversations."""
912 user_id: str
913 export_date: datetime.datetime
914 conversations: Dict[str, SkypeConversation] = field(default_factory=dict)
915 original_json: Dict[str, Any] = field(default_factory=dict)
916
917 @property
918 def total_messages(self) -> int:
919 """Get the total number of messages across all conversations."""
920 return sum(conv.message_count for conv in self.conversations.values())
921
922 @property
923 def total_conversations(self) -> int:
924 """Get the total number of conversations."""
925 return len(self.conversations)
926
927 def get_conversation_by_id(self, id: str) -> Optional[SkypeConversation]:
928 """Get a conversation by its ID."""
929 return self.conversations.get(id)
930
931 def add_conversation(self, conversation: SkypeConversation) -> None:
932 """Add a conversation to the export."""
933 self.conversations[conversation.id] = conversation
934
935 def filter_conversations(self, pattern: str) -> List[SkypeConversation]:
936 """Filter conversations by display name pattern."""
937 return [conv for conv in self.conversations.values()
938 if fnmatch.fnmatch(conv.display_name.lower(), pattern.lower())]
939
940 def get_conversation_stats(self) -> Dict[str, Any]:
941 """Generate statistics about the conversations."""
942 stats = {
943 "total_conversations": self.total_conversations,
944 "total_messages": self.total_messages,
945 "conversation_details": []
946 }
947
948 for conv_id, conv in self.conversations.items():
949 # Skip empty conversations
950 if not conv.messages:
951 continue
952
953 conv_stats = {
954 "id": conv.id,
955 "display_name": conv.display_name,
956 "message_count": conv.message_count,
957 "days_active": conv.days_active,
958 "first_message": conv.first_timestamp.isoformat() if conv.first_timestamp else None,
959 "last_message": conv.last_timestamp.isoformat() if conv.last_timestamp else None,
960 "participants": len(conv.participants),
961 "participants_names": list(conv.participants.values()),
962 "message_types": {}
963 }
964
965 # Count message types
966 for msg in conv.messages:
967 if msg.message_type not in conv_stats["message_types"]:
968 conv_stats["message_types"][msg.message_type] = 0
969 conv_stats["message_types"][msg.message_type] += 1
970
971 stats["conversation_details"].append(conv_stats)
972
973 return stats
974
975# ═════════════════════════════════════════════════════════════════════════════
976# ═══════════════════════════ DATABASE MODELS ═════════════════════════════════
977# ═════════════════════════════════════════════════════════════════════════════
978
979if SQLALCHEMY_AVAILABLE:
980 Base = declarative_base()
981
982 class DbConversation(Base):
983 """Database model for Skype conversations."""
984 __tablename__ = 'conversations'
985
986 id = Column(String(255), primary_key=True)
987 display_name = Column(String(255), index=True)
988 first_timestamp = Column(DateTime, nullable=True, index=True)
989 last_timestamp = Column(DateTime, nullable=True, index=True)
990 message_count = Column(Integer, default=0)
991 days_active = Column(Integer, nullable=True)
992 export_date = Column(DateTime, nullable=False)
993 metadata_json = Column(Text, nullable=True)
994
995 # Relationships
996 messages = relationship("DbMessage", back_populates="conversation",
997 cascade="all, delete-orphan")
998 participants = relationship("DbParticipant", back_populates="conversation",
999 cascade="all, delete-orphan")
1000
1001 class DbMessage(Base):
1002 """Database model for Skype messages."""
1003 __tablename__ = 'messages'
1004
1005 id = Column(String(255), primary_key=True)
1006 conversation_id = Column(String(255), ForeignKey('conversations.id'), index=True)
1007 timestamp = Column(DateTime, nullable=False, index=True)
1008 sender_id = Column(String(255), index=True)
1009 sender_display_name = Column(String(255))
1010 content = Column(Text, nullable=True)
1011 message_type = Column(String(50), index=True)
1012 edited = Column(Boolean, default=False)
1013 metadata_json = Column(Text, nullable=True)
1014
1015 # Relationships
1016 conversation = relationship("DbConversation", back_populates="messages")
1017
1018 class DbParticipant(Base):
1019 """Database model for conversation participants."""
1020 __tablename__ = 'participants'
1021
1022 id = Column(Integer, primary_key=True, autoincrement=True)
1023 conversation_id = Column(String(255), ForeignKey('conversations.id'), index=True)
1024 user_id = Column(String(255), index=True)
1025 display_name = Column(String(255))
1026
1027 # Relationships
1028 conversation = relationship("DbConversation", back_populates="participants")
1029
1030 # Composite unique constraint
1031 __table_args__ = (
1032 sqlalchemy.UniqueConstraint('conversation_id', 'user_id', name='uq_participant'),
1033 )
1034
1035 class DbExportMeta(Base):
1036 """Database model for export metadata."""
1037 __tablename__ = 'export_metadata'
1038
1039 id = Column(Integer, primary_key=True, autoincrement=True)
1040 export_date = Column(DateTime, nullable=False, index=True)
1041 user_id = Column(String(255), index=True)
1042 user_display_name = Column(String(255))
1043 format = Column(String(50))
1044 total_conversations = Column(Integer, default=0)
1045 total_messages = Column(Integer, default=0)
1046 duration_seconds = Column(Integer, default=0)
1047 metadata_json = Column(Text, nullable=True)
1048
1049class DatabaseManager:
1050 """Manage database connections and operations."""
1051
1052 def __init__(self, ctx: AppContext):
1053 """
1054 Initialize the database manager.
1055
1056 Args:
1057 ctx: Application context
1058 """
1059 self.ctx = ctx
1060 self.logger = get_logger('database', ctx)
1061 self.engine = None
1062 self.session_factory = None
1063
1064 # Check required dependencies
1065 if not SQLALCHEMY_AVAILABLE:
1066 self.logger.error("SQLAlchemy is required for database operations but not installed")
1067 raise DatabaseError("SQLAlchemy is required but not installed")
1068
1069 if not PSYCOPG2_AVAILABLE and ctx.options.format == OutputFormat.POSTGRESQL:
1070 self.logger.error("psycopg2 is required for PostgreSQL export but not installed")
1071 raise DatabaseError("psycopg2 is required but not installed")
1072
1073 def initialize(self) -> None:
1074 """Initialize database connection and create schema if needed."""
1075 config = self.ctx.options.database_config
1076
1077 try:
1078 # Create engine with connection pooling
1079 self.engine = create_engine(
1080 config.connection_string,
1081 pool_size=config.connection_pool_size,
1082 max_overflow=config.connection_max_overflow,
1083 pool_timeout=config.connection_timeout,
1084 echo=config.echo_sql
1085 )
1086
1087 # Create session factory
1088 self.session_factory = sessionmaker(bind=self.engine)
1089
1090 # Create tables if they don't exist
1091 Base.metadata.create_all(self.engine)
1092
1093 self.logger.info(f"Connected to database: {config.engine}://{config.host}:{config.port}/{config.database}")
1094
1095 except Exception as e:
1096 self.logger.error(f"Database initialization error: {e}")
1097 raise DatabaseError(f"Failed to initialize database: {e}") from e
1098
1099 @contextmanager
1100 def session(self) -> Generator[Session, None, None]:
1101 """
1102 Get a database session with automatic cleanup.
1103
1104 Yields:
1105 SQLAlchemy session
1106 """
1107 if not self.session_factory:
1108 self.initialize()
1109
1110 session = self.session_factory()
1111 try:
1112 yield session
1113 session.commit()
1114 except Exception as e:
1115 session.rollback()
1116 self.logger.error(f"Database session error: {e}")
1117 raise
1118 finally:
1119 session.close()
1120
1121 def count_conversations(self) -> int:
1122 """
1123 Count conversations in the database.
1124
1125 Returns:
1126 Number of conversations
1127 """
1128 with self.session() as session:
1129 return session.query(DbConversation).count()
1130
1131 def count_messages(self) -> int:
1132 """
1133 Count messages in the database.
1134
1135 Returns:
1136 Number of messages
1137 """
1138 with self.session() as session:
1139 return session.query(DbMessage).count()
1140
1141 def create_export_metadata(self, skype_export: SkypeExport, duration_seconds: int) -> None:
1142 """
1143 Create export metadata record.
1144
1145 Args:
1146 skype_export: SkypeExport object
1147 duration_seconds: Export duration in seconds
1148 """
1149 with self.session() as session:
1150 meta = DbExportMeta(
1151 export_date=skype_export.export_date,
1152 user_id=skype_export.user_id,
1153 user_display_name=self.ctx.user_display_name,
1154 format=self.ctx.options.format.name,
1155 total_conversations=skype_export.total_conversations,
1156 total_messages=skype_export.total_messages,
1157 duration_seconds=duration_seconds,
1158 metadata_json=json.dumps({
1159 "export_date": self.ctx.export_date,
1160 "export_time": self.ctx.export_time,
1161 "options": {k: str(v) for k, v in dataclasses.asdict(self.ctx.options).items()
1162 if k != 'database_config'}
1163 })
1164 )
1165 session.add(meta)
1166
1167# ═════════════════════════════════════════════════════════════════════════════
1168# ═══════════════════════════ CORE PROCESSORS ═════════════════════════════════
1169# ═════════════════════════════════════════════════════════════════════════════
1170
1171class FileReader(ABC):
1172 """Abstract base class for reading different types of input files."""
1173
1174 @abstractmethod
1175 async def read(self, file_path: Path, ctx: AppContext) -> Dict[str, Any]:
1176 """
1177 Read and parse input file.
1178
1179 Args:
1180 file_path: Path to the input file
1181 ctx: Application context
1182
1183 Returns:
1184 Parsed content as dictionary
1185 """
1186 pass
1187
1188 @classmethod
1189 def create_reader(cls, file_path: Path) -> 'FileReader':
1190 """
1191 Factory method to create appropriate reader based on file extension.
1192
1193 Args:
1194 file_path: Path to input file
1195
1196 Returns:
1197 Appropriate FileReader instance
1198 """
1199 suffix = file_path.suffix.lower()
1200 if suffix == '.json':
1201 return JsonFileReader()
1202 elif suffix == '.tar' or suffix == '.gz' or suffix == '.tgz':
1203 return TarFileReader()
1204 elif suffix == '.zip':
1205 return ZipFileReader()
1206 else:
1207 raise ValueError(f"Unsupported file type: {suffix}")
1208
1209class JsonFileReader(FileReader):
1210 """Reader for JSON files."""
1211
1212 async def read(self, file_path: Path, ctx: AppContext) -> Dict[str, Any]:
1213 """Read a regular JSON file."""
1214 ctx.logger.debug(f"Reading JSON file: {file_path}")
1215 loop = asyncio.get_event_loop()
1216
1217 # Check file size - if large, use streaming parser
1218 file_size = file_path.stat().st_size
1219 large_file_threshold = 100 * 1024 * 1024 # 100 MB
1220
1221 if file_size > large_file_threshold:
1222 ctx.logger.info(f"Large JSON file detected ({file_size/1024/1024:.2f} MB). Using streaming parser.")
1223 try:
1224 # Use ijson for streaming if available
1225 import_result = importlib.util.find_spec("ijson")
1226 if import_result is not None:
1227 import ijson
1228 return await loop.run_in_executor(None, self._read_with_ijson, file_path)
1229 else:
1230 ctx.logger.warning("ijson package not available for streaming. Using standard JSON parser.")
1231 except ImportError:
1232 ctx.logger.warning("ijson import failed. Using standard JSON parser.")
1233
1234 # Default JSON loading for normal-sized files or if ijson fails
1235 try:
1236 return await loop.run_in_executor(None, self._read_standard_json, file_path)
1237 except json.JSONDecodeError as e:
1238 raise ParseError(f"Failed to parse JSON file {file_path}: {e}")
1239 except Exception as e:
1240 raise FileReadError(f"Failed to read JSON file {file_path}: {e}")
1241
1242 def _read_standard_json(self, file_path: Path) -> Dict[str, Any]:
1243 """Read a JSON file using the standard json module."""
1244 with open(file_path, 'r', encoding='utf-8') as f:
1245 return json.load(f)
1246
1247 def _read_with_ijson(self, file_path: Path) -> Dict[str, Any]:
1248 """Stream parse a large JSON file using ijson."""
1249 import ijson
1250
1251 result = {}
1252 with open(file_path, 'rb') as f:
1253 # Read top-level elements
1254 for prefix, event, value in ijson.parse(f):
1255 if prefix == '' and event == 'map_key':
1256 current_key = value
1257 elif prefix == '' and event in ('string', 'number', 'boolean'):
1258 result[current_key] = value
1259
1260 # Reopen file and stream the conversations array specifically
1261 f.seek(0)
1262 conversations = []
1263 for conversation in ijson.items(f, 'conversations.item'):
1264 conversations.append(conversation)
1265
1266 result['conversations'] = conversations
1267
1268 return result
1269
1270class TarFileReader(FileReader):
1271 """Reader for TAR file archives."""
1272
1273 async def read(self, file_path: Path, ctx: AppContext) -> Dict[str, Any]:
1274 """Read and extract a TAR archive."""
1275 ctx.logger.debug(f"Reading TAR file: {file_path}")
1276 loop = asyncio.get_event_loop()
1277
1278 try:
1279 return await loop.run_in_executor(None, self._process_tar, file_path, ctx)
1280 except Exception as e:
1281 ctx.logger.error(f"Error reading TAR file {file_path}: {e}")
1282 raise FileReadError(f"Failed to read TAR file: {e}")
1283
1284 def _process_tar(self, file_path: Path, ctx: AppContext) -> Dict[str, Any]:
1285 """Process TAR file contents in a separate thread."""
1286 with tarfile.open(file_path, 'r:*') as tar:
1287 # Extract all files to temporary directory
1288 temp_dir = Path(tempfile.mkdtemp(prefix="original_scripts.testing_"))
1289 try:
1290 tar.extractall(path=temp_dir)
1291 ctx.logger.debug(f"Extracted TAR contents to {temp_dir}")
1292
1293 # Find JSON files
1294 json_files = list(temp_dir.glob('**/*.json'))
1295
1296 # Check if we found any JSON files
1297 if not json_files:
1298 raise FileReadError(f"No JSON files found in TAR archive: {file_path}")
1299
1300 # Handle multiple JSON files
1301 if len(json_files) > 1:
1302 ctx.logger.warning(f"Multiple JSON files found in archive: {[f.name for f in json_files]}")
1303
1304 # In interactive/basic mode, prompt the user to select
1305 if hasattr(ctx, 'ui') and ctx.options.basic_mode:
1306 print("\nMultiple JSON files found in the archive:")
1307 for i, f in enumerate(json_files, 1):
1308 print(f" {i}: {f.name} ({f.stat().st_size / 1024 / 1024:.2f} MB)")
1309
1310 try:
1311 selection = input("\nEnter number to select (1-{}) or press Enter for first file: ".format(len(json_files)))
1312 if selection.strip():
1313 index = int(selection.strip()) - 1
1314 if 0 <= index < len(json_files):
1315 json_file = json_files[index]
1316 ctx.logger.info(f"Selected file: {json_file.name}")
1317 else:
1318 ctx.logger.warning(f"Invalid selection, using first file: {json_files[0].name}")
1319 json_file = json_files[0]
1320 else:
1321 ctx.logger.info(f"No selection made, using first file: {json_files[0].name}")
1322 json_file = json_files[0]
1323 except (ValueError, IndexError):
1324 ctx.logger.warning(f"Invalid input, using first file: {json_files[0].name}")
1325 json_file = json_files[0]
1326 else:
1327 # In non-interactive mode, use largest JSON file (likely the main export)
1328 json_file = max(json_files, key=lambda f: f.stat().st_size)
1329 ctx.logger.info(f"Selected largest JSON file: {json_file.name} ({json_file.stat().st_size / 1024 / 1024:.2f} MB)")
1330 else:
1331 json_file = json_files[0]
1332 ctx.logger.debug(f"Found JSON file: {json_file}")
1333
1334 # Read the selected JSON file
1335 with open(json_file, 'r', encoding='utf-8') as f:
1336 data = json.load(f)
1337
1338 return data
1339 finally:
1340 # Clean up temporary directory
1341 shutil.rmtree(temp_dir)
1342
1343class ZipFileReader(FileReader):
1344 """Reader for ZIP file archives."""
1345
1346 async def read(self, file_path: Path, ctx: AppContext) -> Dict[str, Any]:
1347 """Read and extract a ZIP archive."""
1348 ctx.logger.debug(f"Reading ZIP file: {file_path}")
1349 loop = asyncio.get_event_loop()
1350
1351 try:
1352 return await loop.run_in_executor(None, self._process_zip, file_path, ctx)
1353 except Exception as e:
1354 ctx.logger.error(f"Error reading ZIP file {file_path}: {e}")
1355 raise FileReadError(f"Failed to read ZIP file: {e}")
1356
1357 def _process_zip(self, file_path: Path, ctx: AppContext) -> Dict[str, Any]:
1358 """Process ZIP file contents in a separate thread."""
1359 with zipfile.ZipFile(file_path, 'r') as zip_file:
1360 # Extract all files to temporary directory
1361 temp_dir = Path(tempfile.mkdtemp(prefix="original_scripts.testing_"))
1362 try:
1363 zip_file.extractall(path=temp_dir)
1364 ctx.logger.debug(f"Extracted ZIP contents to {temp_dir}")
1365
1366 # Find JSON files
1367 json_files = list(temp_dir.glob('**/*.json'))
1368
1369 # Check if we found any JSON files
1370 if not json_files:
1371 raise FileReadError(f"No JSON files found in ZIP archive: {file_path}")
1372
1373 # Handle multiple JSON files
1374 if len(json_files) > 1:
1375 ctx.logger.warning(f"Multiple JSON files found in archive: {[f.name for f in json_files]}")
1376
1377 # In interactive/basic mode, prompt the user to select
1378 if hasattr(ctx, 'ui') and ctx.options.basic_mode:
1379 print("\nMultiple JSON files found in the archive:")
1380 for i, f in enumerate(json_files, 1):
1381 print(f" {i}: {f.name} ({f.stat().st_size / 1024 / 1024:.2f} MB)")
1382
1383 try:
1384 selection = input("\nEnter number to select (1-{}) or press Enter for first file: ".format(len(json_files)))
1385 if selection.strip():
1386 index = int(selection.strip()) - 1
1387 if 0 <= index < len(json_files):
1388 json_file = json_files[index]
1389 ctx.logger.info(f"Selected file: {json_file.name}")
1390 else:
1391 ctx.logger.warning(f"Invalid selection, using first file: {json_files[0].name}")
1392 json_file = json_files[0]
1393 else:
1394 ctx.logger.info(f"No selection made, using first file: {json_files[0].name}")
1395 json_file = json_files[0]
1396 except (ValueError, IndexError):
1397 ctx.logger.warning(f"Invalid input, using first file: {json_files[0].name}")
1398 json_file = json_files[0]
1399 else:
1400 # In non-interactive mode, use largest JSON file (likely the main export)
1401 json_file = max(json_files, key=lambda f: f.stat().st_size)
1402 ctx.logger.info(f"Selected largest JSON file: {json_file.name} ({json_file.stat().st_size / 1024 / 1024:.2f} MB)")
1403 else:
1404 json_file = json_files[0]
1405 ctx.logger.debug(f"Found JSON file: {json_file}")
1406
1407 # Read the selected JSON file
1408 with open(json_file, 'r', encoding='utf-8') as f:
1409 data = json.load(f)
1410
1411 return data
1412 finally:
1413 # Clean up temporary directory
1414 shutil.rmtree(temp_dir)
1415
1416class SkypeExportParser:
1417 """Parser for Skype export data."""
1418
1419 def __init__(self, ctx: AppContext):
1420 """
1421 Initialize the parser.
1422
1423 Args:
1424 ctx: Application context
1425 """
1426 self.ctx = ctx
1427 self.logger = get_logger('parser', ctx)
1428
1429 async def parse(self, data: Dict[str, Any]) -> SkypeExport:
1430 """
1431 Parse raw Skype export data into structured domain objects.
1432
1433 Args:
1434 data: Raw JSON data from Skype export
1435
1436 Returns:
1437 Structured SkypeExport object
1438 """
1439 self.logger.info("Parsing Skype export data...")
1440
1441 # Extract basic metadata
1442 user_id, export_date = self._extract_metadata(data)
1443
1444 # Create export object
1445 skype_export = SkypeExport(
1446 user_id=user_id,
1447 export_date=export_date,
1448 original_json=data
1449 )
1450
1451 # Build ID to display name mapping
1452 id_to_display_name = self._build_display_name_map(data)
1453
1454 # Process all conversations
1455 conversations = data.get('conversations', [])
1456 self.ctx.total_conversations = len(conversations)
1457
1458 # Optimize batch size if needed
1459 self._optimize_batch_size(conversations)
1460
1461 # Parse all conversations with progress tracking
1462 await self._parse_conversations_with_progress(conversations, id_to_display_name, skype_export)
1463
1464 self.logger.info(f"Parsed {skype_export.total_conversations} conversations with {skype_export.total_messages} messages")
1465 return skype_export
1466
1467 async def _parse_conversations_with_progress(self, conversations: List[Dict[str, Any]],
1468 id_to_display_name: Dict[str, str],
1469 skype_export: SkypeExport) -> None:
1470 """
1471 Parse conversations with progress tracking.
1472
1473 Args:
1474 conversations: List of conversation data
1475 id_to_display_name: Mapping of user IDs to display names
1476 skype_export: SkypeExport object
1477 """
1478 progress_tracker = self.ctx.progress_tracker
1479 if RICH_AVAILABLE and progress_tracker and not self.ctx.options.basic_mode:
1480 with progress_tracker as progress:
1481 task = progress.add_task("[cyan]Parsing conversations...", total=len(conversations))
1482 for i, conv_data in enumerate(conversations):
1483 conversation = await self._parse_conversation(conv_data, id_to_display_name)
1484 skype_export.add_conversation(conversation)
1485 progress.update(task, advance=1)
1486
1487 # Periodically check memory usage
1488 if i % 5 == 0:
1489 self.ctx.check_memory()
1490
1491 # Check for cancellation
1492 if self.ctx.cancel_requested:
1493 self.logger.info("Parsing cancelled by user")
1494 break
1495 else:
1496 # Simple parsing without rich progress bar
1497 for i, conv_data in enumerate(conversations):
1498 if i % 10 == 0:
1499 self.logger.info(f"Parsing conversation {i+1}/{len(conversations)}")
1500
1501 conversation = await self._parse_conversation(conv_data, id_to_display_name)
1502 skype_export.add_conversation(conversation)
1503
1504 # Periodically check memory usage
1505 if i % 5 == 0:
1506 self.ctx.check_memory()
1507
1508 # Check for cancellation
1509 if self.ctx.cancel_requested:
1510 self.logger.info("Parsing cancelled by user")
1511 break
1512
1513 async def _parse_conversation(self, conv_data: Dict[str, Any],
1514 id_to_display_name: Dict[str, str]) -> SkypeConversation:
1515 """
1516 Parse a single conversation from raw data.
1517
1518 Args:
1519 conv_data: Raw conversation data
1520 id_to_display_name: Mapping of user IDs to display names
1521
1522 Returns:
1523 Structured SkypeConversation object
1524 """
1525 conv_id = conv_data.get('id', '')
1526 display_name = conv_data.get('displayName', '')
1527
1528 # Handle missing display name
1529 if not display_name:
1530 # Try to extract from ID (typically format is "8:username")
1531 try:
1532 display_name = conv_id.split(':')[1]
1533 except (IndexError, AttributeError):
1534 display_name = f"Conversation {conv_id}"
1535
1536 # Update ID to display name mapping
1537 id_to_display_name[conv_id] = display_name
1538
1539 # Create conversation object
1540 conversation = SkypeConversation(
1541 id=conv_id,
1542 display_name=display_name,
1543 original_json=conv_data
1544 )
1545
1546 # Parse messages in parallel if enabled
1547 message_list = conv_data.get('MessageList', [])
1548
1549 if self.ctx.options.parallel and len(message_list) > 100 and not self.ctx.options.basic_mode:
1550 # Process messages in batches for large conversations
1551 loop = asyncio.get_event_loop()
1552
1553 # Use dynamic batch size based on memory constraints
1554 batch_size = self.ctx.options.batch_size
1555 batches = [message_list[i:i+batch_size] for i in range(0, len(message_list), batch_size)]
1556
1557 self.logger.debug(f"Processing {len(message_list)} messages in {len(batches)} batches "
1558 f"(batch size: {batch_size})")
1559
1560 with concurrent.futures.ThreadPoolExecutor(
1561 max_workers=self.ctx.options.max_workers
1562 ) as executor:
1563 # Process each batch in parallel
1564 tasks = []
1565 for batch in batches:
1566 task = loop.run_in_executor(
1567 executor,
1568 self._process_message_batch,
1569 batch,
1570 id_to_display_name,
1571 conversation
1572 )
1573 tasks.append(task)
1574
1575 # Wait for all batches to complete
1576 completed_count = 0
1577 for completed_task in await asyncio.gather(*tasks):
1578 completed_count += 1
1579
1580 # Periodically check memory usage
1581 if completed_count % 5 == 0:
1582 self.ctx.check_memory()
1583 else:
1584 # Process messages sequentially for smaller conversations
1585 for msg_data in message_list:
1586 message = self._parse_message(msg_data, id_to_display_name)
1587 conversation.add_message(message)
1588
1589 # Update participant mapping
1590 for message in conversation.messages:
1591 if message.sender_id not in conversation.participants:
1592 conversation.participants[message.sender_id] = message.sender_display_name
1593
1594 # Sort messages by timestamp
1595 conversation.messages.sort(key=lambda msg: msg.timestamp)
1596
1597 return conversation
1598
1599 def _process_message_batch(self, batch: List[Dict[str, Any]],
1600 id_to_display_name: Dict[str, str],
1601 conversation: SkypeConversation) -> List[SkypeMessage]:
1602 """
1603 Process a batch of messages in a separate thread.
1604
1605 Args:
1606 batch: List of raw message data
1607 id_to_display_name: Mapping of user IDs to display names
1608 conversation: Conversation to add messages to
1609
1610 Returns:
1611 List of parsed messages
1612 """
1613 messages = []
1614 for msg_data in batch:
1615 message = self._parse_message(msg_data, id_to_display_name)
1616 conversation.add_message(message)
1617 messages.append(message)
1618
1619 # Trigger garbage collection for very large batches to manage memory
1620 if len(batch) > 5000 and self.ctx.options.enable_memory_optimization:
1621 gc.collect()
1622
1623 return messages
1624
1625 def _parse_message(self, msg_data: Dict[str, Any],
1626 id_to_display_name: Dict[str, str]) -> SkypeMessage:
1627 """
1628 Parse a single message from raw data.
1629
1630 Args:
1631 msg_data: Raw message data
1632 id_to_display_name: Mapping of user IDs to display names
1633
1634 Returns:
1635 Structured SkypeMessage object
1636 """
1637 # Extract basic message data
1638 msg_id = msg_data.get('id', str(uuid.uuid4()))
1639
1640 # Parse timestamp
1641 timestamp_str = msg_data.get('originalarrivaltime', '')
1642 try:
1643 timestamp = datetime.datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
1644 except (ValueError, TypeError):
1645 self.logger.warning(f"Invalid timestamp format: {timestamp_str}")
1646 timestamp = datetime.datetime.now(datetime.timezone.utc)
1647
1648 # Convert to local time if requested
1649 if self.ctx.options.use_local_time:
1650 timestamp = timestamp.astimezone()
1651
1652 # Extract sender info
1653 sender_id = msg_data.get('from', '')
1654 sender_display_name = id_to_display_name.get(sender_id, sender_id)
1655
1656 # Extract content and type
1657 content = msg_data.get('content', '')
1658 msg_type = msg_data.get('messagetype', 'unknown')
1659
1660 # Special handling for non-text message types
1661 if msg_type != 'RichText':
1662 content = self._get_message_type_description(msg_type)
1663
1664 # Check for edited messages
1665 edited = bool(re.search(r'<e_m.*>', content))
1666
1667 # Create message object
1668 message = SkypeMessage(
1669 id=msg_id,
1670 timestamp=timestamp,
1671 sender_id=sender_id,
1672 sender_display_name=sender_display_name,
1673 content=content,
1674 message_type=msg_type,
1675 edited=edited,
1676 original_json=msg_data
1677 )
1678
1679 return message
1680
1681 def _get_message_type_description(self, msg_type: str) -> str:
1682 """
1683 Convert Skype message type to human-readable description.
1684
1685 Args:
1686 msg_type: Skype message type
1687
1688 Returns:
1689 Human-readable description
1690 """
1691 type_descriptions = {
1692 'Event/Call': '***A call started/ended***',
1693 'Poll': '***Created a poll***',
1694 'RichText/Media_Album': '***Sent an album of images***',
1695 'RichText/Media_AudioMsg': '***Sent a voice message***',
1696 'RichText/Media_CallRecording': '***Sent a call recording***',
1697 'RichText/Media_Card': '***Sent a media card***',
1698 'RichText/Media_FlikMsg': '***Sent a moji***',
1699 'RichText/Media_GenericFile': '***Sent a file***',
1700 'RichText/Media_Video': '***Sent a video message***',
1701 'RichText/UriObject': '***Sent a photo***',
1702 'RichText/ScheduledCallInvite': '***Scheduled a call***',
1703 'RichText/Location': '***Sent a location***',
1704 'RichText/Contacts': '***Sent a contact***',
1705 }
1706
1707 return type_descriptions.get(msg_type, f'***Sent a {msg_type}***')
1708
1709 def _optimize_batch_size(self, conversations: List[Dict[str, Any]]) -> None:
1710 """
1711 Calculate optimal batch size based on data volume.
1712
1713 Args:
1714 conversations: List of raw conversation data
1715 """
1716 # Only optimize if memory monitoring is enabled
1717 if not (self.ctx.options.enable_memory_optimization and self.ctx.memory_monitor):
1718 return
1719
1720 # Adjust batch size based on number of conversations and available memory
1721 conversation_count = len(conversations)
1722 estimated_total_messages = 0
1723
1724 # Sample a few conversations to estimate total message count
1725 sample_size = min(10, conversation_count)
1726 for i in range(sample_size):
1727 conv_data = conversations[i]
1728 estimated_total_messages += len(conv_data.get('MessageList', []))
1729
1730 if sample_size > 0:
1731 avg_messages = estimated_total_messages / sample_size
1732 estimated_total = avg_messages * conversation_count
1733
1734 # Adjust batch size if total is large
1735 if estimated_total > 100000:
1736 optimal_batch_size = self.ctx.memory_monitor.calculate_optimal_batch_size(
1737 int(estimated_total)
1738 )
1739 self.logger.info(f"Adjusting batch size to {optimal_batch_size} "
1740 f"based on estimated {estimated_total:.0f} messages")
1741 self.ctx.options.batch_size = optimal_batch_size
1742
1743 def _extract_metadata(self, data: Dict[str, Any]) -> Tuple[str, datetime.datetime]:
1744 """
1745 Extract user ID and export date from the export data.
1746
1747 Args:
1748 data: Raw JSON data from Skype export
1749
1750 Returns:
1751 Tuple of (user_id, export_date)
1752 """
1753 # Default values
1754 user_id = "unknown"
1755 export_date = datetime.datetime.now()
1756
1757 # Try to extract user ID
1758 if "userId" in data:
1759 user_id = data["userId"]
1760 elif "creator" in data:
1761 user_id = data["creator"]
1762 elif "exportedBy" in data:
1763 user_id = data["exportedBy"]
1764
1765 # Try to extract export date
1766 if "exportDate" in data:
1767 try:
1768 if isinstance(data["exportDate"], str):
1769 # Try ISO format first
1770 try:
1771 export_date = datetime.datetime.fromisoformat(data["exportDate"])
1772 except ValueError:
1773 # Try various date formats
1774 for fmt in ["%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y"]:
1775 try:
1776 export_date = datetime.datetime.strptime(data["exportDate"], fmt)
1777 break
1778 except ValueError:
1779 continue
1780 elif isinstance(data["exportDate"], int):
1781 # Assume Unix timestamp (seconds since epoch)
1782 export_date = datetime.datetime.fromtimestamp(data["exportDate"])
1783 except Exception as e:
1784 self.logger.warning(f"Failed to parse export date: {e}")
1785
1786 # If we still don't have a user ID, try to extract from file metadata
1787 if user_id == "unknown" and "personaList" in data:
1788 for persona in data["personaList"]:
1789 if "cid" in persona:
1790 user_id = persona["cid"]
1791 break
1792
1793 return user_id, export_date
1794
1795 def _build_display_name_map(self, data: Dict[str, Any]) -> Dict[str, str]:
1796 """
1797 Build a mapping from user IDs to display names.
1798
1799 Args:
1800 data: Raw JSON data from Skype export
1801
1802 Returns:
1803 Dictionary mapping user IDs to display names
1804 """
1805 id_to_display_name = {}
1806
1807 # Extract from personas list if available
1808 if "personaList" in data:
1809 for persona in data["personaList"]:
1810 if "cid" in persona and "displayName" in persona:
1811 id_to_display_name[persona["cid"]] = persona["displayName"]
1812
1813 # Extract from conversations/chats if available
1814 if "conversations" in data:
1815 for conv in data["conversations"]:
1816 if "id" in conv and "displayName" in conv:
1817 id_to_display_name[conv["id"]] = conv["displayName"]
1818
1819 if "chats" in data:
1820 for chat in data["chats"]:
1821 if "id" in chat and "threadProperties" in chat and "topic" in chat["threadProperties"]:
1822 id_to_display_name[chat["id"]] = chat["threadProperties"]["topic"]
1823 elif "id" in chat and "displayName" in chat:
1824 id_to_display_name[chat["id"]] = chat["displayName"]
1825
1826 return id_to_display_name
1827
1828class ContentFormatter:
1829 """Base class for content formatting with rich formatting support."""
1830
1831 def __init__(self, ctx: AppContext):
1832 """
1833 Initialize the formatter.
1834
1835 Args:
1836 ctx: Application context
1837 """
1838 self.ctx = ctx
1839 self.logger = get_logger('formatter', ctx)
1840
1841 def format_timestamp(self, timestamp: datetime.datetime) -> str:
1842 """Format timestamp for display."""
1843 return timestamp.strftime("%Y-%m-%d %H:%M:%S")
1844
1845 def format_message(self, message: SkypeMessage) -> str:
1846 """
1847 Format a message for display.
1848
1849 Args:
1850 message: Message to format
1851
1852 Returns:
1853 Formatted message string
1854 """
1855 timestamp = ""
1856 if self.ctx.options.include_timestamps:
1857 timestamp = f"[{self.format_timestamp(message.timestamp)}] "
1858
1859 formatted = f"{timestamp}{message.sender_display_name}: {message.content}"
1860 return formatted
1861
1862 def create_banner(self, conversation: SkypeConversation) -> str:
1863 """
1864 Create a banner with conversation metadata.
1865
1866 Args:
1867 conversation: Conversation to create banner for
1868
1869 Returns:
1870 Banner string
1871 """
1872 banner = [
1873 f"Conversation with: {conversation.display_name} ({conversation.id})",
1874 f"Exported on: {self.ctx.export_date}, at: {self.ctx.export_time}",
1875 ]
1876
1877 if conversation.first_timestamp and conversation.last_timestamp:
1878 banner.extend([
1879 f"Conversations From: {self.format_timestamp(conversation.first_timestamp)}",
1880 f" To: {self.format_timestamp(conversation.last_timestamp)}",
1881 ])
1882
1883 banner.append("***** All times are in UTC *****" if not self.ctx.options.use_local_time
1884 else "***** All times are in local time *****")
1885
1886 return "\n".join(banner)
1887
1888 def parse_content(self, content: str) -> str:
1889 """
1890 Parse and clean message content.
1891
1892 Args:
1893 content: Raw message content
1894
1895 Returns:
1896 Cleaned content
1897 """
1898 if self.ctx.options.strip_html:
1899 if BEAUTIFULSOUP_AVAILABLE:
1900 return self._parse_with_beautifulsoup(content)
1901 else:
1902 return self._parse_with_regex(content)
1903 return content
1904
1905 def _parse_with_beautifulsoup(self, content: str) -> str:
1906 """
1907 Parse content using BeautifulSoup.
1908
1909 Args:
1910 content: Raw HTML content
1911
1912 Returns:
1913 Plain text content
1914 """
1915 soup = BeautifulSoup(content, 'lxml')
1916 text = soup.get_text()
1917 return self._pretty_quotes(text)
1918
1919 def _parse_with_regex(self, content: str) -> str:
1920 """
1921 Parse content using regex fallback.
1922
1923 Args:
1924 content: Raw HTML content
1925
1926 Returns:
1927 Plain text content
1928 """
1929 tag_pattern = re.compile(r'<.*?>')
1930 content = tag_pattern.sub('', content)
1931 content = html.unescape(content)
1932 return self._pretty_quotes(content)
1933
1934 def _pretty_quotes(self, text: str) -> str:
1935 """
1936 Format quotes for better readability.
1937
1938 Args:
1939 text: Text with quote markers
1940
1941 Returns:
1942 Text with formatted quotes
1943 """
1944 # Replace quote markers with more readable format
1945 quote_pattern = re.compile(r'\[[+-]?\d+(?:\.\d+)?\]')
1946 text = quote_pattern.sub(r'\n\t*** Quoting the following message: ***\n\t', text)
1947
1948 response_pattern = re.compile(r'\<\<\<')
1949 text = response_pattern.sub('\t*** And responding with: ***\n\t', text)
1950
1951 return text
1952
1953class TextExporter:
1954 """Exports conversations to plain text format."""
1955
1956 def __init__(self, ctx: AppContext):
1957 """Initialize text exporter with application context."""
1958 self.ctx = ctx
1959 self.logger = ctx.logger.getChild('exporter.text')
1960 self.formatter = ContentFormatter(ctx)
1961 # Check if aiofiles is available
1962 self.aiofiles_available = importlib.util.find_spec("aiofiles") is not None
1963 if self.aiofiles_available:
1964 self.logger.debug("aiofiles is available, will use for async file operations")
1965 import aiofiles
1966 self.aiofiles = aiofiles
1967 else:
1968 self.logger.debug("aiofiles not available, using custom async file wrapper")
1969
1970 async def export_conversation(self, conversation: SkypeConversation, output_dir: Path) -> Path:
1971 """
1972 Export a conversation to a text file.
1973
1974 Args:
1975 conversation: Conversation to export
1976 output_dir: Output directory
1977
1978 Returns:
1979 Path to the exported file
1980 """
1981 self.logger.info(f"Exporting conversation '{conversation.display_name}' to text")
1982
1983 # Create file name from conversation display name
1984 file_name = sanitize_filename(conversation.display_name)
1985 output_path = get_unique_filename(output_dir, file_name, "txt")
1986
1987 # Prepare content
1988 content = []
1989
1990 # Add banner with conversation info
1991 content.append(self.formatter.create_banner(conversation))
1992 content.append("") # Empty line after banner
1993
1994 # Group messages by date
1995 message_dates = sorted(conversation.get_message_dates())
1996
1997 # Process each date
1998 for date in message_dates:
1999 # Add date header
2000 date_str = date.strftime("%A, %B %d, %Y")
2001 content.append(f"\n=== {date_str} ===\n")
2002
2003 # Add messages for this date
2004 messages = conversation.get_messages_by_date(date)
2005 for message in messages:
2006 content.append(self.formatter.format_message(message))
2007
2008 # Write to file using async I/O
2009 try:
2010 if self.aiofiles_available:
2011 # Use aiofiles for truly async I/O
2012 async with self.aiofiles.open(output_path, 'w', encoding='utf-8') as f:
2013 await f.write('\n'.join(content))
2014 else:
2015 # Fall back to custom async wrapper
2016 with self._async_open(output_path, 'w', encoding='utf-8') as f:
2017 await f.write('\n'.join(content))
2018
2019 self.logger.info(f"Exported to {output_path}")
2020 return output_path
2021
2022 except Exception as e:
2023 self.logger.error(f"Failed to write text file: {e}")
2024 raise FileWriteError(f"Failed to write text file: {e}")
2025
2026 @contextmanager
2027 def _async_open(self, file_path: Path, mode: str, **kwargs):
2028 """
2029 Context manager for async file operations.
2030
2031 Args:
2032 file_path: Path to file
2033 mode: File mode
2034 **kwargs: Additional open arguments
2035
2036 Yields:
2037 AsyncFile object
2038 """
2039 class AsyncFile:
2040 def __init__(self, file_obj):
2041 self.file_obj = file_obj
2042
2043 async def write(self, content):
2044 loop = asyncio.get_event_loop()
2045 await loop.run_in_executor(None, self.file_obj.write, content)
2046
2047 async def read(self):
2048 loop = asyncio.get_event_loop()
2049 return await loop.run_in_executor(None, self.file_obj.read)
2050
2051 file_obj = open(file_path, mode, **kwargs)
2052 try:
2053 yield AsyncFile(file_obj)
2054 finally:
2055 file_obj.close()
2056
2057class HtmlExporter:
2058 """Exporter for HTML format with styling."""
2059
2060 def __init__(self, ctx: AppContext):
2061 """
2062 Initialize the exporter.
2063
2064 Args:
2065 ctx: Application context
2066 """
2067 self.ctx = ctx
2068 self.formatter = ContentFormatter(ctx)
2069 self.logger = get_logger('exporter.html', ctx)
2070
2071 # Check for required dependencies
2072 if not JINJA2_AVAILABLE:
2073 ctx.logger.warning("Jinja2 not installed. HTML export will use basic formatting.")
2074
2075 async def export_conversation(self, conversation: SkypeConversation, output_dir: Path) -> Path:
2076 """
2077 Export a conversation to HTML format.
2078
2079 Args:
2080 conversation: Conversation to export
2081 output_dir: Directory to write output to
2082
2083 Returns:
2084 Path to the exported file
2085 """
2086 self.logger.debug(f"Exporting conversation {conversation.display_name} to HTML")
2087
2088 # Create filename with enhanced sanitization
2089 safe_name = sanitize_filename(conversation.display_name)
2090 filename = f"[{self.ctx.export_date}]-{safe_name}.html"
2091 output_path = output_dir / filename
2092
2093 # Group messages by date
2094 message_groups = {}
2095 for date in sorted(conversation.get_message_dates()):
2096 message_groups[date.isoformat()] = conversation.get_messages_by_date(date)
2097
2098 # Generate HTML
2099 if JINJA2_AVAILABLE:
2100 html_content = self._generate_html_with_jinja(conversation, message_groups)
2101 else:
2102 html_content = self._generate_basic_html(conversation, message_groups)
2103
2104 # Write to file
2105 try:
2106 loop = asyncio.get_event_loop()
2107 await loop.run_in_executor(
2108 None,
2109 lambda: output_path.write_text(html_content, encoding='utf-8')
2110 )
2111 except Exception as e:
2112 self.logger.error(f"Error writing to {output_path}: {e}")
2113 raise FileWriteError(f"Failed to write HTML to {output_path}: {e}")
2114
2115 self.logger.info(f"Exported {conversation.message_count} messages to {output_path}")
2116 return output_path
2117
2118 def _generate_html_with_jinja(self, conversation: SkypeConversation,
2119 message_groups: Dict[str, List[SkypeMessage]]) -> str:
2120 """
2121 Generate HTML using Jinja2 templates.
2122
2123 Args:
2124 conversation: Conversation to export
2125 message_groups: Messages grouped by date
2126
2127 Returns:
2128 Generated HTML string
2129 """
2130 # Create template
2131 template_str = """
2132 <!DOCTYPE html>
2133 <html lang="en">
2134 <head>
2135 <meta charset="UTF-8">
2136 <meta name="viewport" content="width=device-width, initial-scale=1.0">
2137 <title>{{ conversation.display_name }} - Skype Chat</title>
2138 <style>
2139 body {
2140 font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
2141 line-height: 1.6;
2142 color: #333;
2143 max-width: 800px;
2144 margin: 0 auto;
2145 padding: 20px;
2146 }
2147 .header {
2148 background-color: #00aff0;
2149 color: white;
2150 padding: 15px;
2151 border-radius: 5px;
2152 margin-bottom: 20px;
2153 }
2154 .date-header {
2155 background-color: #e6e6e6;
2156 padding: 8px 15px;
2157 border-radius: 5px;
2158 margin: 25px 0 15px 0;
2159 font-weight: bold;
2160 }
2161 .message {
2162 margin-bottom: 10px;
2163 padding: 10px;
2164 border-radius: 5px;
2165 }
2166 .message:nth-child(odd) {
2167 background-color: #f5f5f5;
2168 }
2169 .timestamp {
2170 color: #777;
2171 font-size: 0.85em;
2172 margin-right: 10px;
2173 }
2174 .sender {
2175 font-weight: bold;
2176 margin-right: 10px;
2177 }
2178 .content {
2179 white-space: pre-wrap;
2180 }
2181 .quote {
2182 border-left: 3px solid #00aff0;
2183 padding-left: 10px;
2184 color: #555;
2185 font-style: italic;
2186 }
2187 .metadata {
2188 font-size: 0.9em;
2189 color: #777;
2190 }
2191 .edited {
2192 color: #999;
2193 font-style: italic;
2194 font-size: 0.85em;
2195 }
2196 .special {
2197 color: #777;
2198 font-style: italic;
2199 }
2200 </style>
2201 </head>
2202 <body>
2203 <div class="header">
2204 <h1>{{ conversation.display_name }}</h1>
2205 <div class="metadata">
2206 <p>Exported on: {{ export_date }}, at: {{ export_time }}</p>
2207 {% if conversation.first_timestamp %}
2208 <p>Conversations from: {{ formatter.format_timestamp(conversation.first_timestamp) }}</p>
2209 <p>To: {{ formatter.format_timestamp(conversation.last_timestamp) }}</p>
2210 {% endif %}
2211 <p>{{ time_zone_note }}</p>
2212 </div>
2213 </div>
2214
2215 {% for date, messages in message_groups.items() %}
2216 <div class="date-header">Conversations on {{ date }}</div>
2217
2218 {% for message in messages %}
2219 <div class="message">
2220 {% if include_timestamps %}
2221 <span class="timestamp">[{{ formatter.format_timestamp(message.timestamp) }}]</span>
2222 {% endif %}
2223 <span class="sender">{{ message.sender_display_name }}:</span>
2224
2225 {% if message.message_type != 'RichText' %}
2226 <span class="special">{{ message.content }}</span>
2227 {% else %}
2228 <span class="content">{{ formatter.parse_content(message.content) }}</span>
2229 {% if message.edited %}
2230 <div class="edited">This message was edited</div>
2231 {% endif %}
2232 {% endif %}
2233 </div>
2234 {% endfor %}
2235 {% endfor %}
2236 </body>
2237 </html>
2238 """
2239
2240 # Create template and render
2241 template = jinja2.Template(template_str)
2242 return template.render(
2243 conversation=conversation,
2244 message_groups=message_groups,
2245 formatter=self.formatter,
2246 export_date=self.ctx.export_date,
2247 export_time=self.ctx.export_time,
2248 include_timestamps=self.ctx.options.include_timestamps,
2249 time_zone_note="All times are in UTC" if not self.ctx.options.use_local_time else "All times are in local time"
2250 )
2251
2252 def _generate_basic_html(self, conversation: SkypeConversation,
2253 message_groups: Dict[str, List[SkypeMessage]]) -> str:
2254 """
2255 Generate basic HTML without Jinja2.
2256
2257 Args:
2258 conversation: Conversation to export
2259 message_groups: Messages grouped by date
2260
2261 Returns:
2262 Generated HTML string
2263 """
2264 # Create HTML pieces
2265 html_parts = [
2266 '<!DOCTYPE html>',
2267 '<html lang="en">',
2268 '<head>',
2269 ' <meta charset="UTF-8">',
2270 f' <title>{html.escape(conversation.display_name)} - Skype Chat</title>',
2271 ' <style>',
2272 ' body { font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }',
2273 ' .header { background-color: #00aff0; color: white; padding: 15px; }',
2274 ' .date-header { background-color: #e6e6e6; padding: 8px; margin: 20px 0 10px 0; }',
2275 ' .message { margin-bottom: 10px; padding: 8px; }',
2276 ' .message:nth-child(odd) { background-color: #f5f5f5; }',
2277 ' </style>',
2278 '</head>',
2279 '<body>',
2280 f' <div class="header"><h1>{html.escape(conversation.display_name)}</h1>',
2281 f' <p>Exported on: {self.ctx.export_date}, at: {self.ctx.export_time}</p>'
2282 ]
2283
2284 if conversation.first_timestamp and conversation.last_timestamp:
2285 html_parts.extend([
2286 f' <p>Conversations from: {self.formatter.format_timestamp(conversation.first_timestamp)}</p>',
2287 f' <p>To: {self.formatter.format_timestamp(conversation.last_timestamp)}</p>'
2288 ])
2289
2290 time_note = "All times are in UTC" if not self.ctx.options.use_local_time else "All times are in local time"
2291 html_parts.append(f' <p>{time_note}</p>')
2292 html_parts.append(' </div>')
2293
2294 # Add messages grouped by date
2295 for date, messages in message_groups.items():
2296 html_parts.append(f' <div class="date-header">Conversations on {date}</div>')
2297
2298 for message in messages:
2299 html_parts.append(' <div class="message">')
2300
2301 if self.ctx.options.include_timestamps:
2302 html_parts.append(f' <span>[{self.formatter.format_timestamp(message.timestamp)}]</span>')
2303
2304 html_parts.append(f' <strong>{html.escape(message.sender_display_name)}:</strong> ')
2305
2306 # Handle different message types
2307 if message.message_type != 'RichText':
2308 html_parts.append(f' <em>{html.escape(message.content)}</em>')
2309 else:
2310 content = self.formatter.parse_content(message.content)
2311 html_parts.append(f' <span>{html.escape(content)}</span>')
2312
2313 if message.edited:
2314 html_parts.append(' <div><em>This message was edited</em></div>')
2315
2316 html_parts.append(' </div>')
2317
2318 html_parts.extend(['</body>', '</html>'])
2319 return '\n'.join(html_parts)
2320
2321class MarkdownExporter:
2322 """Exporter for Markdown format."""
2323
2324 def __init__(self, ctx: AppContext):
2325 """
2326 Initialize the exporter.
2327
2328 Args:
2329 ctx: Application context
2330 """
2331 self.ctx = ctx
2332 self.formatter = ContentFormatter(ctx)
2333 self.logger = get_logger('exporter.markdown', ctx)
2334
2335 async def export_conversation(self, conversation: SkypeConversation, output_dir: Path) -> Path:
2336 """
2337 Export a conversation to Markdown format.
2338
2339 Args:
2340 conversation: Conversation to export
2341 output_dir: Directory to write output to
2342
2343 Returns:
2344 Path to the exported file
2345 """
2346 self.logger.debug(f"Exporting conversation {conversation.display_name} to Markdown")
2347
2348 # Create filename with enhanced sanitization
2349 safe_name = sanitize_filename(conversation.display_name)
2350 filename = f"[{self.ctx.export_date}]-{safe_name}.md"
2351 output_path = output_dir / filename
2352
2353 # Create banner
2354 content = [
2355 f"# Conversation with {conversation.display_name}",
2356 "",
2357 "## Metadata",
2358 f"- **Exported on:** {self.ctx.export_date}, at: {self.ctx.export_time}"
2359 ]
2360
2361 if conversation.first_timestamp and conversation.last_timestamp:
2362 content.extend([
2363 f"- **First message:** {self.formatter.format_timestamp(conversation.first_timestamp)}",
2364 f"- **Last message:** {self.formatter.format_timestamp(conversation.last_timestamp)}"
2365 ])
2366
2367 time_note = "All times are in UTC" if not self.ctx.options.use_local_time else "All times are in local time"
2368 content.append(f"- **Note:** {time_note}")
2369 content.append("")
2370
2371 # Group messages by date
2372 for date in sorted(conversation.get_message_dates()):
2373 date_messages = conversation.get_messages_by_date(date)
2374 if date_messages:
2375 content.append(f"## Conversations on {date.isoformat()}")
2376 content.append("")
2377
2378 for message in date_messages:
2379 # Format timestamp
2380 timestamp = ""
2381 if self.ctx.options.include_timestamps:
2382 timestamp = f"**[{self.formatter.format_timestamp(message.timestamp)}]** "
2383
2384 # Format sender
2385 sender = f"**{message.sender_display_name}:** "
2386
2387 # Format content
2388 if message.message_type != 'RichText':
2389 msg_content = f"*{message.content}*"
2390 else:
2391 msg_content = self.formatter.parse_content(message.content)
2392 # Escape markdown characters in content
2393 msg_content = re.sub(r'([_*~`#])', r'\\\1', msg_content)
2394
2395 # Add edited indicator
2396 if message.edited:
2397 msg_content += " *(edited)*"
2398
2399 # Add complete message
2400 content.append(f"{timestamp}{sender}{msg_content}")
2401 content.append("")
2402
2403 # Write to file
2404 try:
2405 loop = asyncio.get_event_loop()
2406 await loop.run_in_executor(
2407 None,
2408 lambda: output_path.write_text('\n'.join(content), encoding='utf-8')
2409 )
2410 except Exception as e:
2411 self.logger.error(f"Error writing to {output_path}: {e}")
2412 raise FileWriteError(f"Failed to write Markdown to {output_path}: {e}")
2413
2414 self.logger.info(f"Exported {conversation.message_count} messages to {output_path}")
2415 return output_path
2416
2417class JsonExporter:
2418 """Exporter for JSON format with full message data."""
2419
2420 def __init__(self, ctx: AppContext):
2421 """
2422 Initialize the exporter.
2423
2424 Args:
2425 ctx: Application context
2426 """
2427 self.ctx = ctx
2428 self.logger = get_logger('exporter.json', ctx)
2429
2430 async def export_conversation(self, conversation: SkypeConversation, output_dir: Path) -> Path:
2431 """
2432 Export a conversation to JSON format.
2433
2434 Args:
2435 conversation: Conversation to export
2436 output_dir: Directory to write output to
2437
2438 Returns:
2439 Path to the exported file
2440 """
2441 self.logger.debug(f"Exporting conversation {conversation.display_name} to JSON")
2442
2443 # Create filename with enhanced sanitization
2444 safe_name = sanitize_filename(conversation.display_name)
2445 filename = f"[{self.ctx.export_date}]-{safe_name}.json"
2446 output_path = output_dir / filename
2447
2448 # Create serializable data structure
2449 data = {
2450 "metadata": {
2451 "id": conversation.id,
2452 "display_name": conversation.display_name,
2453 "export_date": self.ctx.export_date,
2454 "export_time": self.ctx.export_time,
2455 "message_count": conversation.message_count,
2456 "first_message": conversation.first_timestamp.isoformat() if conversation.first_timestamp else None,
2457 "last_message": conversation.last_timestamp.isoformat() if conversation.last_timestamp else None,
2458 "participants": conversation.participants,
2459 "timezone": "UTC" if not self.ctx.options.use_local_time else "local"
2460 },
2461 "messages": []
2462 }
2463
2464 # Add messages
2465 for message in conversation.messages:
2466 msg_data = {
2467 "id": message.id,
2468 "timestamp": message.timestamp.isoformat(),
2469 "sender_id": message.sender_id,
2470 "sender_display_name": message.sender_display_name,
2471 "content": message.content,
2472 "message_type": message.message_type,
2473 "edited": message.edited
2474 }
2475
2476 # Include original JSON if requested
2477 if self.ctx.options.include_message_ids:
2478 msg_data["original_json"] = message.original_json
2479
2480 data["messages"].append(msg_data)
2481
2482 # Write to file with indentation if pretty print is enabled
2483 indent = 2 if self.ctx.options.pretty_print else None
2484
2485 try:
2486 loop = asyncio.get_event_loop()
2487 await loop.run_in_executor(
2488 None,
2489 lambda: output_path.write_text(
2490 json.dumps(data, indent=indent, ensure_ascii=False),
2491 encoding='utf-8'
2492 )
2493 )
2494 except Exception as e:
2495 self.logger.error(f"Error writing to {output_path}: {e}")
2496 raise FileWriteError(f"Failed to write JSON to {output_path}: {e}")
2497
2498 self.logger.info(f"Exported {conversation.message_count} messages to {output_path}")
2499 return output_path
2500
2501class PostgreSQLExporter:
2502 """Exporter for PostgreSQL database with normalized schema."""
2503
2504 def __init__(self, ctx: AppContext):
2505 """
2506 Initialize the exporter.
2507
2508 Args:
2509 ctx: Application context
2510 """
2511 self.ctx = ctx
2512 self.logger = get_logger('exporter.postgresql', ctx)
2513
2514 # Check for required dependencies
2515 if not SQLALCHEMY_AVAILABLE:
2516 raise ExportError("SQLAlchemy is required for PostgreSQL export but not installed")
2517
2518 if not PSYCOPG2_AVAILABLE:
2519 raise ExportError("psycopg2 is required for PostgreSQL export but not installed")
2520
2521 # Initialize database manager
2522 self.db_manager = DatabaseManager(ctx)
2523
2524 # Assign DB model classes to instance attributes for use in queries
2525 self.DbConversation = DbConversation
2526 self.DbMessage = DbMessage
2527 self.DbParticipant = DbParticipant
2528
2529 async def export_conversation(self, conversation: SkypeConversation, output_dir: Path) -> Path:
2530 """
2531 Export a conversation to PostgreSQL database.
2532
2533 Args:
2534 conversation: Conversation to export
2535 output_dir: Directory to write output to
2536
2537 Returns:
2538 Path to a metadata file with export info
2539 """
2540 self.logger.debug(f"Exporting conversation {conversation.display_name} to PostgreSQL")
2541
2542 # Create metadata file
2543 safe_name = sanitize_filename(conversation.display_name)
2544 filename = f"[{self.ctx.export_date}]-{safe_name}-pg_export_info.json"
2545 output_path = output_dir / filename
2546
2547 try:
2548 # Initialize database connection
2549 if not hasattr(self, '_db_initialized'):
2550 self.db_manager.initialize()
2551 self._db_initialized = True
2552
2553 # Export conversation to database
2554 await self._export_to_database(conversation)
2555
2556 # Create a metadata file with export information
2557 meta_data = {
2558 "export_type": "PostgreSQL",
2559 "conversation": {
2560 "id": conversation.id,
2561 "display_name": conversation.display_name,
2562 "message_count": conversation.message_count,
2563 "first_message": conversation.first_timestamp.isoformat() if conversation.first_timestamp else None,
2564 "last_message": conversation.last_timestamp.isoformat() if conversation.last_timestamp else None,
2565 },
2566 "database": {
2567 "engine": self.ctx.options.database_config.engine,
2568 "host": self.ctx.options.database_config.host,
2569 "port": self.ctx.options.database_config.port,
2570 "database": self.ctx.options.database_config.database,
2571 "schema": self.ctx.options.database_config.schema,
2572 },
2573 "export_date": self.ctx.export_date,
2574 "export_time": self.ctx.export_time,
2575 "sql_connection_string": self.get_sanitized_connection_string()
2576 }
2577
2578 loop = asyncio.get_event_loop()
2579 await loop.run_in_executor(
2580 None,
2581 lambda: output_path.write_text(
2582 json.dumps(meta_data, indent=2, ensure_ascii=False),
2583 encoding='utf-8'
2584 )
2585 )
2586
2587 self.logger.info(f"Exported {conversation.message_count} messages to PostgreSQL "
2588 f"and saved metadata to {output_path}")
2589 return output_path
2590
2591 except Exception as e:
2592 self.logger.error(f"Error exporting to PostgreSQL: {e}")
2593 raise ExportError(f"Failed to export to PostgreSQL: {e}")
2594
2595 async def _export_to_database(self, conversation: SkypeConversation) -> None:
2596 """
2597 Export conversation data to PostgreSQL database.
2598
2599 Args:
2600 conversation: Conversation to export
2601 """
2602 # Use asyncio to run database operations in a thread pool
2603 loop = asyncio.get_event_loop()
2604 await loop.run_in_executor(
2605 None,
2606 self._export_conversation_sync,
2607 conversation
2608 )
2609
2610 def _export_conversation_sync(self, conversation: SkypeConversation) -> None:
2611 """Export a conversation to PostgreSQL database (synchronous)."""
2612 try:
2613 # First handle the conversation record in its own transaction
2614 with self.db_manager.session() as session:
2615 try:
2616 # Check if conversation already exists
2617 db_conversation = session.query(self.DbConversation).filter_by(
2618 id=conversation.id
2619 ).first()
2620
2621 # Create or update conversation record
2622 if not db_conversation:
2623 db_conversation = self.DbConversation(
2624 id=conversation.id,
2625 display_name=conversation.display_name,
2626 first_timestamp=conversation.first_timestamp,
2627 last_timestamp=conversation.last_timestamp,
2628 message_count=conversation.message_count,
2629 days_active=conversation.days_active,
2630 export_date=datetime.datetime.now(),
2631 metadata_json=json.dumps(conversation.original_json)
2632 if self.ctx.options.include_metadata else None
2633 )
2634 session.add(db_conversation)
2635 else:
2636 # Update existing conversation
2637 db_conversation.display_name = conversation.display_name
2638 db_conversation.first_timestamp = conversation.first_timestamp
2639 db_conversation.last_timestamp = conversation.last_timestamp
2640 db_conversation.message_count = conversation.message_count
2641 db_conversation.days_active = conversation.days_active
2642 db_conversation.export_date = datetime.datetime.now()
2643 if self.ctx.options.include_metadata:
2644 db_conversation.metadata_json = json.dumps(conversation.original_json)
2645
2646 # Process participants in the same transaction as the conversation
2647 for user_id, display_name in conversation.participants.items():
2648 # Check if participant already exists for this conversation
2649 participant = session.query(self.DbParticipant).filter_by(
2650 conversation_id=conversation.id,
2651 user_id=user_id
2652 ).first()
2653
2654 if not participant:
2655 participant = self.DbParticipant(
2656 conversation_id=conversation.id,
2657 user_id=user_id,
2658 display_name=display_name
2659 )
2660 session.add(participant)
2661 else:
2662 participant.display_name = display_name
2663
2664 # Commit conversation and participants
2665 session.commit()
2666 self.logger.debug(f"Saved conversation record for {conversation.id}")
2667
2668 except Exception as e:
2669 session.rollback()
2670 self.logger.error(f"Failed to save conversation record: {e}")
2671 # Re-raise to abort the whole export for this conversation
2672 raise
2673
2674 # Process messages in batches with separate transactions
2675 batch_size = self.ctx.options.batch_size
2676 total_messages = len(conversation.messages)
2677 successful_messages = 0
2678 failed_batches = 0
2679
2680 # Process messages in batches
2681 for i in range(0, len(conversation.messages), batch_size):
2682 batch = conversation.messages[i:i+batch_size]
2683
2684 # Create a new session for each batch
2685 with self.db_manager.session() as session:
2686 try:
2687 for message in batch:
2688 # Check if message already exists
2689 existing_message = session.query(self.DbMessage).filter_by(
2690 id=message.id
2691 ).first()
2692
2693 if not existing_message:
2694 # Create new message record
2695 db_message = self.DbMessage(
2696 id=message.id,
2697 conversation_id=conversation.id,
2698 timestamp=message.timestamp,
2699 sender_id=message.sender_id,
2700 sender_display_name=message.sender_display_name,
2701 content=message.content,
2702 message_type=message.message_type,
2703 edited=message.edited,
2704 metadata_json=json.dumps(message.original_json)
2705 if self.ctx.options.include_metadata else None
2706 )
2707 session.add(db_message)
2708
2709 # Commit this batch
2710 session.commit()
2711 successful_messages += len(batch)
2712 self.logger.debug(f"Processed message batch {i//batch_size + 1}/{(total_messages-1)//batch_size + 1} "
2713 f"({len(batch)} messages)")
2714
2715 except Exception as e:
2716 session.rollback()
2717 failed_batches += 1
2718 self.logger.error(f"Failed to process message batch {i//batch_size + 1}: {e}")
2719 # Continue with next batch instead of aborting all
2720
2721 # Check memory after each batch
2722 if self.ctx.check_memory():
2723 self.logger.debug("Memory optimization performed between batches")
2724
2725 # Log summary
2726 if failed_batches > 0:
2727 self.logger.warning(f"Conversation {conversation.id} export completed with {failed_batches} failed batches. "
2728 f"Successfully saved {successful_messages}/{total_messages} messages.")
2729 else:
2730 self.logger.info(f"Successfully exported conversation {conversation.id} "
2731 f"with {successful_messages} messages.")
2732
2733 except Exception as e:
2734 self.logger.error(f"Failed to export conversation {conversation.id}: {e}")
2735 raise
2736
2737 def get_sanitized_connection_string(self) -> str:
2738 """Generate SQLAlchemy connection string with password masked for secure logging."""
2739 config = self.ctx.options.database_config
2740 # Always mask password regardless of its length
2741 return (f"{config.engine}://{config.username}:****@"
2742 f"{config.host}:{config.port}/{config.database}")
2743
2744class ExportManager:
2745 """Manages the export process for all conversation formats."""
2746
2747 def __init__(self, ctx: AppContext):
2748 """
2749 Initialize the export manager.
2750
2751 Args:
2752 ctx: Application context
2753 """
2754 self.ctx = ctx
2755 self.logger = get_logger('export_manager', ctx)
2756
2757 # Create exporters
2758 self.exporters = {
2759 OutputFormat.TEXT: TextExporter(ctx),
2760 OutputFormat.HTML: HtmlExporter(ctx),
2761 OutputFormat.MARKDOWN: MarkdownExporter(ctx),
2762 OutputFormat.JSON: JsonExporter(ctx),
2763 OutputFormat.POSTGRESQL: PostgreSQLExporter(ctx)
2764 }
2765
2766 async def export_conversations(self, skype_export: SkypeExport,
2767 conversations: List[SkypeConversation] = None) -> Dict[str, List[Path]]:
2768 """
2769 Export selected conversations in specified formats.
2770
2771 Args:
2772 skype_export: Complete Skype export data
2773 conversations: Optional list of conversations to export (all if None)
2774
2775 Returns:
2776 Dictionary mapping format names to lists of exported file paths
2777 """
2778 self.logger.info("Starting export process...")
2779
2780 # Use all conversations if none specified
2781 if conversations is None:
2782 conversations = list(skype_export.conversations.values())
2783
2784 # Filter conversations if pattern specified
2785 if self.ctx.options.filter_pattern:
2786 pattern = self.ctx.options.filter_pattern
2787 filtered = [
2788 c for c in conversations
2789 if fnmatch.fnmatch(c.display_name.lower(), pattern.lower())
2790 ]
2791
2792 if not filtered:
2793 self.logger.warning(f"No conversations matched pattern '{pattern}'")
2794 if not self.ctx.options.basic_mode:
2795 self.logger.info("Available conversations:")
2796 for conv in conversations[:10]:
2797 self.logger.info(f"- {conv.display_name}")
2798 if len(conversations) > 10:
2799 self.logger.info(f"... and {len(conversations) - 10} more")
2800
2801 conversations = filtered
2802
2803 # Create output directory
2804 output_dir = self.ctx.options.output_dir
2805 ensure_directory(output_dir)
2806
2807 # Determine which formats to export
2808 formats = [self.ctx.options.format]
2809 if self.ctx.options.format == OutputFormat.ALL:
2810 formats = [f for f in OutputFormat if f != OutputFormat.ALL]
2811
2812 # Create format-specific directories
2813 format_dirs = {}
2814 for format in formats:
2815 format_name = format.name.lower()
2816 format_dir = output_dir / format_name
2817 ensure_directory(format_dir)
2818 format_dirs[format] = format_dir
2819
2820 # Track exported files
2821 exported_files = {format.name: [] for format in formats}
2822
2823 # Create progress bar if available
2824 progress_tracker = self.ctx.progress_tracker
2825 total_exports = len(conversations) * len(formats)
2826
2827 if RICH_AVAILABLE and progress_tracker and not self.ctx.options.basic_mode:
2828 with progress_tracker as progress:
2829 task = progress.add_task("[green]Exporting conversations...", total=total_exports)
2830
2831 # Export each conversation in each format
2832 for conversation in conversations:
2833 for format in formats:
2834 if self.ctx.cancel_requested:
2835 self.logger.info("Export cancelled by user")
2836 return exported_files
2837
2838 exported_file = await self._export_conversation(
2839 conversation, format, format_dirs[format]
2840 )
2841 exported_files[format.name].append(exported_file)
2842 progress.update(task, advance=1)
2843
2844 # Periodically check memory usage
2845 self.ctx.check_memory()
2846 else:
2847 # Simple progress tracking
2848 processed = 0
2849
2850 # Export each conversation in each format
2851 for conversation in conversations:
2852 for format in formats:
2853 if self.ctx.cancel_requested:
2854 self.logger.info("Export cancelled by user")
2855 return exported_files
2856
2857 processed += 1
2858 if processed % 5 == 0 or processed == total_exports:
2859 self.logger.info(f"Export progress: {processed}/{total_exports}")
2860
2861 exported_file = await self._export_conversation(
2862 conversation, format, format_dirs[format]
2863 )
2864 exported_files[format.name].append(exported_file)
2865
2866 # Periodically check memory usage
2867 self.ctx.check_memory()
2868
2869 # Create stats file if requested
2870 if self.ctx.options.include_conversation_stats:
2871 await self._export_stats(skype_export, output_dir)
2872
2873 # Compress output if requested
2874 if self.ctx.options.compress_output:
2875 await self._compress_output(output_dir)
2876
2877 return exported_files
2878
2879 async def _export_conversation(self, conversation: SkypeConversation,
2880 format: OutputFormat, output_dir: Path) -> Path:
2881 """
2882 Export a single conversation in specified format.
2883
2884 Args:
2885 conversation: Conversation to export
2886 format: Format to export in
2887 output_dir: Directory to write output to
2888
2889 Returns:
2890 Path to exported file
2891 """
2892 try:
2893 exporter = self.exporters[format]
2894 return await exporter.export_conversation(conversation, output_dir)
2895 except Exception as e:
2896 self.logger.error(f"Error exporting conversation {conversation.display_name} "
2897 f"in {format.name} format: {e}")
2898 self.ctx.errors.append({
2899 "type": "export_error",
2900 "conversation_id": conversation.id,
2901 "format": format.name,
2902 "error": str(e),
2903 "traceback": traceback.format_exc()
2904 })
2905 # Create a dummy path as fallback
2906 return output_dir / f"ERROR-{sanitize_filename(conversation.id)}.failed"
2907
2908 async def _export_stats(self, skype_export: SkypeExport, output_dir: Path) -> Path:
2909 """
2910 Export conversation statistics.
2911
2912 Args:
2913 skype_export: Complete Skype export data
2914 output_dir: Directory to write output to
2915
2916 Returns:
2917 Path to stats file
2918 """
2919 stats_file = output_dir / "conversation_stats.json"
2920 stats = skype_export.get_conversation_stats()
2921
2922 # Add export metadata
2923 stats["export_metadata"] = {
2924 "export_date": self.ctx.export_date,
2925 "export_time": self.ctx.export_time,
2926 "user_id": self.ctx.user_id,
2927 "user_display_name": self.ctx.user_display_name,
2928 "exported_formats": [f.name for f in OutputFormat if f != OutputFormat.ALL],
2929 "processed_at": datetime.datetime.now().isoformat()
2930 }
2931
2932 # Add memory usage if available
2933 memory_report = self.ctx.get_memory_report()
2934 if memory_report:
2935 stats["system_resources"] = memory_report
2936
2937 # Write stats file
2938 try:
2939 loop = asyncio.get_event_loop()
2940 await loop.run_in_executor(
2941 None,
2942 lambda: stats_file.write_text(
2943 json.dumps(stats, indent=2, ensure_ascii=False),
2944 encoding='utf-8'
2945 )
2946 )
2947
2948 self.logger.info(f"Exported conversation statistics to {stats_file}")
2949 return stats_file
2950 except Exception as e:
2951 self.logger.error(f"Error writing statistics to {stats_file}: {e}")
2952 raise FileWriteError(f"Failed to write statistics to {stats_file}: {e}")
2953
2954 async def _compress_output(self, output_dir: Path) -> Path:
2955 """
2956 Compress output directory.
2957
2958 Args:
2959 output_dir: Directory to compress
2960
2961 Returns:
2962 Path to compressed file
2963 """
2964 timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
2965 archive_path = output_dir.with_name(f"{output_dir.name}_{timestamp}.zip")
2966
2967 self.logger.info(f"Compressing output to {archive_path}")
2968
2969 # Create zip archive in executor to avoid blocking
2970 try:
2971 loop = asyncio.get_event_loop()
2972 await loop.run_in_executor(
2973 None,
2974 self._create_zip_archive,
2975 output_dir,
2976 archive_path
2977 )
2978
2979 self.logger.info(f"Export compressed to {archive_path}")
2980 return archive_path
2981 except Exception as e:
2982 self.logger.error(f"Error compressing output: {e}")
2983 raise FileWriteError(f"Failed to compress output: {e}")
2984
2985 def _create_zip_archive(self, source_dir: Path, output_path: Path) -> None:
2986 """
2987 Create a ZIP archive of a directory.
2988
2989 Args:
2990 source_dir: Directory to compress
2991 output_path: Path for output ZIP file
2992 """
2993 with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
2994 for root, _, files in os.walk(source_dir):
2995 for file in files:
2996 file_path = Path(root) / file
2997 # Make path relative to source_dir
2998 rel_path = file_path.relative_to(source_dir)
2999 zipf.write(file_path, arcname=rel_path)
3000
3001# ═════════════════════════════════════════════════════════════════════════════
3002# ═══════════════════════════ BASIC MODE HANDLER ═══════════════════════════════
3003# ═════════════════════════════════════════════════════════════════════════════
3004
3005class BasicModeHandler:
3006 """
3007 Handles simplified workflow for basic mode operation.
3008
3009 This class provides a straightforward procedural flow for users
3010 who prefer simplicity over advanced features.
3011 """
3012
3013 def __init__(self, ctx: AppContext):
3014 """
3015 Initialize basic mode handler.
3016
3017 Args:
3018 ctx: Application context
3019 """
3020 self.ctx = ctx
3021 self.logger = get_logger('basic_mode', ctx)
3022
3023 async def run(self, file_path: Path) -> int:
3024 """
3025 Run the basic mode workflow.
3026
3027 Args:
3028 file_path: Path to Skype export file
3029
3030 Returns:
3031 Exit code
3032 """
3033 # Simple welcome message
3034 print("\n" + "=" * 60)
3035 print(" SkypeExporter - Basic Mode")
3036 print(" Simple Skype Chat Exporter")
3037 print("=" * 60 + "\n")
3038
3039 try:
3040 # Get user display name
3041 user_display_name = input("\nPlease enter your name as you want it to appear in the logs: ")
3042 while not user_display_name.strip():
3043 user_display_name = input("Name cannot be empty. Please enter your name: ")
3044
3045 self.ctx.user_display_name = user_display_name
3046 print(f"\nWelcome, {user_display_name}!")
3047
3048 # Process input file
3049 print(f"\nReading Skype export file: {file_path}")
3050 file_reader = FileReader.create_reader(file_path)
3051 raw_data = await file_reader.read(file_path, self.ctx)
3052
3053 # Parse data
3054 print("\nParsing Skype conversations...")
3055 parser = SkypeExportParser(self.ctx)
3056 skype_export = await parser.parse(raw_data)
3057
3058 # Show available conversations
3059 conversations = list(skype_export.conversations.values())
3060 valid_conversations = [c for c in conversations if c.messages]
3061
3062 if not valid_conversations:
3063 print("\nNo conversations with messages found in the export.")
3064 return 0
3065
3066 print(f"\nFound {len(valid_conversations)} conversations in the export file.")
3067
3068 # Choose export format
3069 print("\nAvailable export formats:")
3070 print("1. Text (.txt)")
3071 print("2. HTML (.html)")
3072 print("3. Markdown (.md)")
3073 print("4. JSON (.json)")
3074 print("5. All formats")
3075
3076 format_choice = input("\nChoose format (1-5): ").strip()
3077 while format_choice not in ["1", "2", "3", "4", "5"]:
3078 format_choice = input("Please enter a number between 1 and 5: ").strip()
3079
3080 format_map = {
3081 "1": OutputFormat.TEXT,
3082 "2": OutputFormat.HTML,
3083 "3": OutputFormat.MARKDOWN,
3084 "4": OutputFormat.JSON,
3085 "5": OutputFormat.ALL
3086 }
3087
3088 self.ctx.options.format = format_map[format_choice]
3089
3090 # Choose conversations
3091 print("\nDo you want to:")
3092 print("1. Export all conversations")
3093 print("2. Select specific conversations")
3094
3095 selection_choice = input("\nChoose option (1-2): ").strip()
3096 while selection_choice not in ["1", "2"]:
3097 selection_choice = input("Please enter either 1 or 2: ").strip()
3098
3099 selected_conversations = None
3100 if selection_choice == "2":
3101 selected_conversations = await self._select_conversations(valid_conversations)
3102
3103 if not selected_conversations:
3104 print("\nNo conversations selected, nothing to export.")
3105 return 0
3106
3107 # Choose output directory
3108 default_output_dir = self.ctx.options.output_dir
3109 output_dir = input(f"\nOutput directory [default: {default_output_dir}]: ").strip()
3110 if not output_dir:
3111 output_dir = default_output_dir
3112
3113 self.ctx.options.output_dir = Path(output_dir)
3114
3115 # Advanced options
3116 include_timestamps = input("\nInclude timestamps? (y/n) [default: y]: ").strip().lower()
3117 self.ctx.options.include_timestamps = include_timestamps != "n"
3118
3119 local_time = input("Use local time instead of UTC? (y/n) [default: y]: ").strip().lower()
3120 self.ctx.options.use_local_time = local_time != "n"
3121
3122 compress_output = input("Compress output to zip? (y/n) [default: n]: ").strip().lower()
3123 self.ctx.options.compress_output = compress_output == "y"
3124
3125 # Export conversations
3126 print("\nStarting export process...")
3127
3128 export_manager = ExportManager(self.ctx)
3129 exported_files = await export_manager.export_conversations(
3130 skype_export, selected_conversations
3131 )
3132
3133 # Display summary
3134 total_conversations = sum(len(files) for files in exported_files.values())
3135 output_path = self.ctx.options.output_dir
3136
3137 print("\n" + "=" * 60)
3138 print(" Export Summary")
3139 print("-" * 60)
3140 print(f"Total conversations: {len(valid_conversations)}")
3141 print(f"Exported conversations: {total_conversations}")
3142
3143 for format_name, files in exported_files.items():
3144 if files:
3145 print(f"{format_name} files: {len(files)}")
3146
3147 print(f"Output directory: {output_path}")
3148
3149 if self.ctx.errors:
3150 print(f"\nErrors: {len(self.ctx.errors)}")
3151 for i, error in enumerate(self.ctx.errors, 1):
3152 print(f" {i}. {error['type']} - {error['error']}")
3153
3154 print("\nExport completed successfully!")
3155 print(f"Files saved to: {output_path}")
3156
3157 return 0
3158
3159 except Exception as e:
3160 print(f"\nError: {e}")
3161 return 1
3162
3163 async def _select_conversations(self, conversations: List[SkypeConversation]) -> List[SkypeConversation]:
3164 """
3165 Allow user to select conversations in basic mode.
3166
3167 Args:
3168 conversations: List of valid conversations
3169
3170 Returns:
3171 List of selected conversations
3172 """
3173 print("\nAvailable conversations:")
3174 for i, conv in enumerate(conversations, 1):
3175 message_count = conv.message_count
3176 first_date = conv.first_timestamp.strftime("%Y-%m-%d") if conv.first_timestamp else "N/A"
3177 print(f"{i:3}. {conv.display_name} ({message_count} messages, since {first_date})")
3178
3179 print("\nEnter conversation numbers to export, separated by spaces.")
3180 print("For example: '1 3 5' will export the first, third, and fifth conversations.")
3181 print("Enter 'all' to export all conversations.")
3182
3183 selection = input("\nSelection: ").strip()
3184
3185 if selection.lower() == 'all':
3186 return conversations
3187
3188 try:
3189 indices = [int(idx.strip()) for idx in selection.split() if idx.strip()]
3190
3191 # Validate indices
3192 valid_indices = [idx for idx in indices if 1 <= idx <= len(conversations)]
3193
3194 if not valid_indices:
3195 print("No valid selection made. Please try again.")
3196 return await self._select_conversations(conversations)
3197
3198 # Get selected conversations
3199 selected = [conversations[idx-1] for idx in valid_indices]
3200
3201 # Confirm selection
3202 print(f"\nYou selected {len(selected)} conversations:")
3203 for conv in selected:
3204 print(f"- {conv.display_name}")
3205
3206 confirm = input("\nConfirm selection? (y/n) [default: y]: ").strip().lower()
3207 if confirm == "n":
3208 return await self._select_conversations(conversations)
3209
3210 return selected
3211
3212 except ValueError:
3213 print("Invalid selection format. Please enter numbers separated by spaces.")
3214 return await self._select_conversations(conversations)
3215
3216# ═════════════════════════════════════════════════════════════════════════════
3217# ═══════════════════════════ USER INTERFACE ═════════════════════════════════
3218# ═════════════════════════════════════════════════════════════════════════════
3219
3220class ConversationSelector:
3221 """Interactive conversation selector with rich UI if available."""
3222
3223 def __init__(self, ctx: AppContext):
3224 """
3225 Initialize the selector.
3226
3227 Args:
3228 ctx: Application context
3229 """
3230 self.ctx = ctx
3231 self.logger = get_logger('conversation_selector', ctx)
3232
3233 async def select_conversations(self, skype_export: SkypeExport) -> List[SkypeConversation]:
3234 """
3235 Allow user to select conversations to export.
3236
3237 Args:
3238 skype_export: Complete Skype export data
3239
3240 Returns:
3241 List of selected conversations
3242 """
3243 conversations = list(skype_export.conversations.values())
3244
3245 # Filter out empty conversations
3246 valid_conversations = [c for c in conversations if c.messages]
3247
3248 if not valid_conversations:
3249 self.logger.warning("No conversations with messages found")
3250 return []
3251
3252 # Sort by display name
3253 valid_conversations.sort(key=lambda c: c.display_name.lower())
3254
3255 # Use rich UI if available
3256 if RICH_AVAILABLE:
3257 return await self._rich_select_conversations(valid_conversations)
3258 else:
3259 return await self._text_select_conversations(valid_conversations)
3260
3261 async def _rich_select_conversations(self, conversations: List[SkypeConversation]) -> List[SkypeConversation]:
3262 """
3263 Select conversations using rich UI.
3264
3265 Args:
3266 conversations: Available conversations
3267
3268 Returns:
3269 List of selected conversations
3270 """
3271 # Create table of conversations
3272 table = Table(title="Available Conversations")
3273 table.add_column("#", justify="right")
3274 table.add_column("Name", style="cyan")
3275 table.add_column("Messages", justify="right")
3276 table.add_column("First Message", justify="right")
3277 table.add_column("Last Message", justify="right")
3278
3279 # Add rows
3280 for i, conv in enumerate(conversations, 1):
3281 table.add_row(
3282 str(i),
3283 conv.display_name,
3284 str(conv.message_count),
3285 conv.first_timestamp.strftime("%Y-%m-%d") if conv.first_timestamp else "N/A",
3286 conv.last_timestamp.strftime("%Y-%m-%d") if conv.last_timestamp else "N/A"
3287 )
3288
3289 # Display table
3290 self.ctx.console.print(table)
3291 self.ctx.console.print("\nEnter the numbers of conversations to export, separated by spaces.")
3292 self.ctx.console.print("Enter 'all' to export all conversations.")
3293
3294 # Get user selection
3295 selection = await self._get_user_input("\nSelection: ")
3296
3297 if selection.lower() == 'all':
3298 return conversations
3299
3300 # Parse selection
3301 try:
3302 indices = [int(idx.strip()) for idx in selection.split() if idx.strip()]
3303
3304 # Validate indices
3305 valid_indices = [idx for idx in indices if 1 <= idx <= len(conversations)]
3306 if not valid_indices:
3307 self.ctx.console.print("[bold red]No valid selection made[/bold red]")
3308 return []
3309
3310 # Get selected conversations
3311 selected = [conversations[idx-1] for idx in valid_indices]
3312
3313 # Confirm selection
3314 self.ctx.console.print(f"\nYou selected [cyan]{len(selected)}[/cyan] conversations:")
3315 for conv in selected[:5]:
3316 self.ctx.console.print(f"- {conv.display_name}")
3317
3318 if len(selected) > 5:
3319 self.ctx.console.print(f"- ... and {len(selected) - 5} more")
3320
3321 confirm = Confirm.ask("Confirm this selection?", default=True)
3322 if not confirm:
3323 return await self._rich_select_conversations(conversations)
3324
3325 return selected
3326
3327 except ValueError:
3328 self.ctx.console.print("[bold red]Invalid selection format[/bold red]")
3329 return []
3330
3331 async def _text_select_conversations(self, conversations: List[SkypeConversation]) -> List[SkypeConversation]:
3332 """
3333 Select conversations using text UI.
3334
3335 Args:
3336 conversations: Available conversations
3337
3338 Returns:
3339 List of selected conversations
3340 """
3341 print("\nYou have conversations with the following:")
3342 print("--------------------------------------------")
3343
3344 for i, conv in enumerate(conversations, 1):
3345 first_date = "N/A"
3346 if conv.first_timestamp:
3347 first_date = conv.first_timestamp.strftime("%Y-%m-%d")
3348
3349 print(f"{i:3} -> {conv.display_name} ({conv.message_count} messages, since {first_date})")
3350
3351 print("\nEnter the numbers of conversations to export, separated by spaces.")
3352 print("Enter 'all' to export all conversations.")
3353
3354 # Get user selection
3355 selection = await self._get_user_input("\nSelection: ")
3356
3357 if selection.lower() == 'all':
3358 return conversations
3359
3360 # Parse selection
3361 try:
3362 indices = [int(idx.strip()) for idx in selection.split() if idx.strip()]
3363
3364 # Validate indices
3365 valid_indices = [idx for idx in indices if 1 <= idx <= len(conversations)]
3366 if not valid_indices:
3367 print("No valid selection made")
3368 return []
3369
3370 # Get selected conversations
3371 selected = [conversations[idx-1] for idx in valid_indices]
3372
3373 # Confirm selection
3374 print(f"\nYou selected {len(selected)} conversations:")
3375 for conv in selected[:5]:
3376 print(f"- {conv.display_name}")
3377
3378 if len(selected) > 5:
3379 print(f"- ... and {len(selected) - 5} more")
3380
3381 confirm = input("\nConfirm this selection? (y/n) [default: y]: ").strip().lower()
3382 if confirm == "n":
3383 return await self._text_select_conversations(conversations)
3384
3385 return selected
3386
3387 except ValueError:
3388 print("Invalid selection format")
3389 return []
3390
3391 async def _get_user_input(self, prompt: str) -> str:
3392 """
3393 Get user input asynchronously.
3394
3395 Args:
3396 prompt: Prompt text
3397
3398 Returns:
3399 User input string
3400 """
3401 loop = asyncio.get_event_loop()
3402 return await loop.run_in_executor(None, input, prompt)
3403
3404class UserInterface:
3405 """Main user interface handling interaction and display."""
3406
3407 def __init__(self, ctx: AppContext):
3408 """
3409 Initialize the UI.
3410
3411 Args:
3412 ctx: Application context
3413 """
3414 self.ctx = ctx
3415 self.logger = get_logger('ui', ctx)
3416
3417 async def get_user_display_name(self) -> str:
3418 """
3419 Get display name from user with enhanced validation.
3420
3421 Returns:
3422 User display name
3423 """
3424 # Use rich UI if available
3425 if RICH_AVAILABLE:
3426 self.ctx.console.print("\n[bold cyan]Please enter your display name for the logs:[/bold cyan]")
3427 display_name = await self._get_user_input("")
3428 else:
3429 display_name = await self._get_user_input("\nIn the logs, your name should be displayed as: ")
3430
3431 # Validate input
3432 while not display_name.strip():
3433 if RICH_AVAILABLE:
3434 self.ctx.console.print("[bold red]Display name cannot be empty![/bold red]")
3435 display_name = await self._get_user_input("Please enter how you want your name to be displayed: ")
3436 else:
3437 display_name = await self._get_user_input("\nPlease enter how you want your name to be displayed: ")
3438
3439 # Additional validation for unusually long names
3440 if len(display_name) > 50:
3441 warning = "Your display name is unusually long. Are you sure you want to use this name?"
3442
3443 if RICH_AVAILABLE:
3444 self.ctx.console.print(f"[bold yellow]{warning}[/bold yellow]")
3445 confirm = Confirm.ask("Continue with this name?", default=True)
3446 if not confirm:
3447 return await self.get_user_display_name()
3448 else:
3449 print(f"\nWarning: {warning}")
3450 confirm = input("Continue with this name? (y/n) [default: y]: ").strip().lower()
3451 if confirm == "n":
3452 return await self.get_user_display_name()
3453
3454 return display_name
3455
3456 def display_welcome(self) -> None:
3457 """Display welcome message with app info."""
3458 if RICH_AVAILABLE:
3459 # Create fancy header
3460 self.ctx.console.print("\n[bold blue]╔═══════════════════════════════════════════════════════════╗[/bold blue]")
3461 self.ctx.console.print("[bold blue]║[/bold blue] [bold cyan]SkypeExporter v2.0.0[/bold cyan] [bold blue]║[/bold blue]")
3462 self.ctx.console.print("[bold blue]║[/bold blue] [italic]Enterprise-Grade Skype Chat Parser[/italic] [bold blue]║[/bold blue]")
3463 self.ctx.console.print("[bold blue]╚═══════════════════════════════════════════════════════════╝[/bold blue]\n")
3464
3465 # Show system info
3466 self.ctx.console.print("[bold]System Information:[/bold]")
3467 self.ctx.console.print(f" Python: {platform.python_version()}")
3468 self.ctx.console.print(f" Platform: {platform.system()} {platform.release()}")
3469
3470 # Show memory info if available
3471 if self.ctx.memory_monitor:
3472 mem_usage = self.ctx.memory_monitor.get_memory_usage_mb()
3473 mem_percent = self.ctx.memory_monitor.get_memory_percent()
3474 sys_memory = self.ctx.memory_monitor.get_system_memory_mb()
3475
3476 self.ctx.console.print(f" Memory: {mem_usage:.1f} MB / {sys_memory:.1f} MB ({mem_percent:.1f}%)")
3477
3478 # Show dependency status
3479 self.ctx.console.print("\n[bold]Dependency Status:[/bold]")
3480 dep_status = check_dependencies()
3481 for pkg, status in dep_status.items():
3482 color = "green" if status else "red"
3483 symbol = "✓" if status else "✗"
3484 self.ctx.console.print(f" [{color}]{symbol}[/{color}] {pkg}")
3485
3486 # Show mode info
3487 mode = "[bold cyan]Basic Mode[/bold cyan]" if self.ctx.options.basic_mode else "[bold green]Advanced Mode[/bold green]"
3488 self.ctx.console.print(f"\nRunning in {mode}")
3489
3490 self.ctx.console.print("\n[italic]Starting export process...[/italic]\n")
3491
3492 else:
3493 # Simple text header
3494 print("\n" + "=" * 60)
3495 print(" SkypeExporter v2.0.0")
3496 print(" Enterprise-Grade Skype Chat Parser")
3497 print("=" * 60 + "\n")
3498
3499 # Show system info
3500 print(f"Python: {platform.python_version()}")
3501 print(f"Platform: {platform.system()} {platform.release()}")
3502
3503 # Show memory info if available
3504 if self.ctx.memory_monitor:
3505 mem_usage = self.ctx.memory_monitor.get_memory_usage_mb()
3506 mem_percent = self.ctx.memory_monitor.get_memory_percent()
3507 sys_memory = self.ctx.memory_monitor.get_system_memory_mb()
3508
3509 print(f"Memory: {mem_usage:.1f} MB / {sys_memory:.1f} MB ({mem_percent:.1f}%)")
3510
3511 # Show dependency status
3512 print("\nDependency Status:")
3513 dep_status = check_dependencies()
3514 for pkg, status in dep_status.items():
3515 symbol = "✓" if status else "✗"
3516 print(f" {symbol} {pkg}")
3517
3518 # Show mode info
3519 mode = "Basic Mode" if self.ctx.options.basic_mode else "Advanced Mode"
3520 print(f"\nRunning in {mode}")
3521
3522 print("\nStarting export process...\n")
3523
3524 def display_summary(self, skype_export: SkypeExport, exported_files: Dict[str, List[Path]]) -> None:
3525 """
3526 Display export summary.
3527
3528 Args:
3529 skype_export: Complete Skype export data
3530 exported_files: Dictionary of exported files by format
3531 """
3532 total_conversations = sum(len(files) for files in exported_files.values())
3533 elapsed_time = time.time() - self.ctx.start_time
3534 output_dir = self.ctx.options.output_dir
3535
3536 if RICH_AVAILABLE and not self.ctx.options.basic_mode:
3537 # Create summary panel
3538 summary = Table(title="Export Summary", show_header=False, box=None)
3539 summary.add_column("", style="bold cyan")
3540 summary.add_column("")
3541
3542 summary.add_row("Total conversations:", str(skype_export.total_conversations))
3543 summary.add_row("Total messages:", str(skype_export.total_messages))
3544 summary.add_row("Exported conversations:", str(total_conversations))
3545
3546 # Add export formats
3547 for format_name, files in exported_files.items():
3548 if files:
3549 summary.add_row(f"{format_name} files:", str(len(files)))
3550
3551 summary.add_row("Output directory:", str(output_dir))
3552 summary.add_row("Processing time:", f"{elapsed_time:.2f} seconds")
3553
3554 # Add memory usage if available
3555 memory_report = self.ctx.get_memory_report()
3556 if memory_report:
3557 peak_mb = memory_report.get("peak_usage_mb", 0)
3558 summary.add_row("Peak memory usage:", f"{peak_mb:.2f} MB")
3559
3560 if self.ctx.errors:
3561 summary.add_row("Errors:", f"[bold red]{len(self.ctx.errors)}[/bold red]")
3562
3563 # Display summary in panel
3564 panel = Panel(summary, title="SkypeExporter Completed", border_style="green")
3565 self.ctx.console.print(panel)
3566
3567 # Show errors if any
3568 if self.ctx.errors:
3569 self.ctx.console.print("\n[bold red]Errors encountered:[/bold red]")
3570 for i, error in enumerate(self.ctx.errors, 1):
3571 self.ctx.console.print(f" {i}. {error['type']} - {error['error']}")
3572
3573 self.ctx.console.print("\n[bold green]Export completed successfully![/bold green]")
3574 self.ctx.console.print(f"Files saved to: [cyan]{output_dir}[/cyan]")
3575
3576 else:
3577 # Simple text summary
3578 print("\n" + "=" * 60)
3579 print(" Export Summary")
3580 print("-" * 60)
3581 print(f"Total conversations: {skype_export.total_conversations}")
3582 print(f"Total messages: {skype_export.total_messages}")
3583 print(f"Exported conversations: {total_conversations}")
3584
3585 # Add export formats
3586 for format_name, files in exported_files.items():
3587 if files:
3588 print(f"{format_name} files: {len(files)}")
3589
3590 print(f"Output directory: {output_dir}")
3591 print(f"Processing time: {elapsed_time:.2f} seconds")
3592
3593 # Add memory usage if available
3594 memory_report = self.ctx.get_memory_report()
3595 if memory_report:
3596 peak_mb = memory_report.get("peak_usage_mb", 0)
3597 print(f"Peak memory usage: {peak_mb:.2f} MB")
3598
3599 if self.ctx.errors:
3600 print(f"Errors: {len(self.ctx.errors)}")
3601
3602 print("=" * 60)
3603
3604 # Show errors if any
3605 if self.ctx.errors:
3606 print("\nErrors encountered:")
3607 for i, error in enumerate(self.ctx.errors, 1):
3608 print(f" {i}. {error['type']} - {error['error']}")
3609
3610 print("\nExport completed successfully!")
3611 print(f"Files saved to: {output_dir}")
3612
3613 async def _get_user_input(self, prompt: str) -> str:
3614 """
3615 Get user input asynchronously.
3616
3617 Args:
3618 prompt: Prompt text
3619
3620 Returns:
3621 User input string
3622 """
3623 loop = asyncio.get_event_loop()
3624 return await loop.run_in_executor(None, input, prompt)
3625
3626# ═════════════════════════════════════════════════════════════════════════════
3627# ═══════════════════════════ APPLICATION CORE ═══════════════════════════════
3628# ═════════════════════════════════════════════════════════════════════════════
3629
3630class SkypeExporterApp:
3631 """Main application class orchestrating the export process."""
3632
3633 def __init__(self):
3634 """Initialize the application."""
3635 # Parse command line arguments
3636 self.args = self._parse_args()
3637
3638 # Create app context
3639 self.ctx = AppContext(
3640 options=self._create_options(),
3641 logger=setup_logging(
3642 LogLevel.DEBUG if self.args.debug else LogLevel.INFO,
3643 log_file=Path(self.args.log_file) if self.args.log_file else None
3644 )
3645 )
3646
3647 # Create UI components
3648 self.ui = UserInterface(self.ctx)
3649 self.selector = ConversationSelector(self.ctx)
3650 self.basic_mode_handler = BasicModeHandler(self.ctx)
3651
3652 # Set up signal handlers
3653 self._setup_signal_handlers()
3654
3655 def _parse_args(self) -> argparse.Namespace:
3656 """
3657 Parse command line arguments.
3658
3659 Returns:
3660 Parsed arguments
3661 """
3662 parser = argparse.ArgumentParser(
3663 description="SkypeExporter: Enterprise-Grade Skype Chat Log Exporter",
3664 formatter_class=argparse.ArgumentDefaultsHelpFormatter
3665 )
3666
3667 parser.add_argument('filename',
3668 help='Path to the Skype export file (JSON, TAR, or ZIP)')
3669
3670 parser.add_argument('-o', '--output-dir',
3671 help='Directory to save exported files',
3672 default=os.path.join(os.getcwd(), "skype_exports"))
3673
3674 parser.add_argument('-f', '--format',
3675 choices=['text', 'html', 'markdown', 'json', 'postgresql', 'all'],
3676 default='text',
3677 help='Output format for exported conversations')
3678
3679 parser.add_argument('-c', '--choose',
3680 action='store_true',
3681 help='Choose which conversations to export')
3682
3683 parser.add_argument('-p', '--pattern',
3684 help='Filter conversations by name pattern (supports wildcards)')
3685
3686 parser.add_argument('--filter',
3687 help='Alternative name for pattern filter')
3688
3689 parser.add_argument('-a', '--anonymize',
3690 action='store_true',
3691 help='Anonymize user names in exports')
3692
3693 parser.add_argument('-s', '--stats',
3694 action='store_true',
3695 help='Include conversation statistics')
3696
3697 parser.add_argument('--no-stats',
3698 action='store_true',
3699 help='Exclude conversation statistics')
3700
3701 parser.add_argument('-t', '--timestamps',
3702 action='store_true',
3703 default=True,
3704 help='Include timestamps in exports')
3705
3706 parser.add_argument('--no-timestamps',
3707 action='store_true',
3708 help='Exclude timestamps from exports')
3709
3710 parser.add_argument('-l', '--local-time',
3711 action='store_true',
3712 help='Use local time instead of UTC')
3713
3714 parser.add_argument('--utc',
3715 action='store_true',
3716 help='Use UTC time (default)')
3717
3718 parser.add_argument('--no-parallel',
3719 action='store_true',
3720 help='Disable parallel processing')
3721
3722 parser.add_argument('--batch-size',
3723 type=int,
3724 help='Batch size for processing messages')
3725
3726 parser.add_argument('--max-workers',
3727 type=int,
3728 help='Maximum number of worker threads for parallel processing')
3729
3730 parser.add_argument('--compress',
3731 action='store_true',
3732 help='Compress output files into ZIP archive')
3733
3734 parser.add_argument('--timezone',
3735 help='Timezone for timestamps (e.g. "America/New_York")')
3736
3737 parser.add_argument('--no-pretty',
3738 action='store_true',
3739 help='Disable pretty printing for JSON output')
3740
3741 parser.add_argument('--include-metadata',
3742 action='store_true',
3743 help='Include metadata in exports')
3744
3745 parser.add_argument('--include-ids',
3746 action='store_true',
3747 help='Include message IDs in exports')
3748
3749 parser.add_argument('--include-html',
3750 action='store_true',
3751 help='Include HTML in exports')
3752
3753 parser.add_argument('--media-links',
3754 action='store_true',
3755 help='Include media links in exports')
3756
3757 parser.add_argument('--date-from',
3758 help='Start date for filtering messages (YYYY-MM-DD format)')
3759
3760 parser.add_argument('--date-to',
3761 help='End date for filtering messages (YYYY-MM-DD format)')
3762
3763 parser.add_argument('--debug',
3764 action='store_true',
3765 help='Enable debug logging')
3766
3767 parser.add_argument('--log-file',
3768 help='Path to log file')
3769
3770 parser.add_argument('--basic',
3771 action='store_true',
3772 help='Use basic mode with simplified interaction')
3773
3774 parser.add_argument('--memory-profile',
3775 action='store_true',
3776 help='Enable memory profiling')
3777
3778 parser.add_argument('--no-memory-optimization',
3779 action='store_true',
3780 help='Disable automatic memory optimization')
3781
3782 parser.add_argument('--no-memory-opt',
3783 action='store_true',
3784 help='Alternative name for disabling memory optimization')
3785
3786 parser.add_argument('--memory-threshold',
3787 type=int,
3788 help='Memory usage threshold percentage for optimization (1-99)')
3789
3790 # PostgreSQL options
3791 db_group = parser.add_argument_group('PostgreSQL Database Options')
3792 db_group.add_argument('--db-host',
3793 help='Database host (for PostgreSQL export)',
3794 default='localhost')
3795 db_group.add_argument('--db-port',
3796 type=int,
3797 help='Database port (for PostgreSQL export)',
3798 default=5432)
3799 db_group.add_argument('--db-name',
3800 help='Database name (for PostgreSQL export)',
3801 default='skype_export')
3802 db_group.add_argument('--db-user',
3803 help='Database username (for PostgreSQL export)',
3804 default='postgres')
3805 db_group.add_argument('--db-password',
3806 help='Database password (for PostgreSQL export)',
3807 default='')
3808 db_group.add_argument('--db-engine',
3809 help='Database engine (for PostgreSQL export)',
3810 default='postgresql')
3811 db_group.add_argument('--db-schema',
3812 help='Database schema (for PostgreSQL export)',
3813 default='public')
3814 db_group.add_argument('--db-echo',
3815 action='store_true',
3816 help='Echo SQL queries (for debugging)')
3817
3818 parser.add_argument('--version',
3819 action='version',
3820 version='SkypeExporter 2.0.0')
3821
3822 return parser.parse_args()
3823
3824 def _create_options(self) -> ExportOptions:
3825 """
3826 Create export options from command line arguments.
3827
3828 Returns:
3829 Configured ExportOptions object
3830 """
3831 args = self.args
3832
3833 # Validate numeric inputs
3834 try:
3835 if args.batch_size is not None:
3836 args.batch_size = int(args.batch_size)
3837 if args.batch_size <= 0:
3838 raise ConfigError("Batch size must be a positive integer")
3839
3840 if args.max_workers is not None:
3841 args.max_workers = int(args.max_workers)
3842 if args.max_workers < 1:
3843 raise ConfigError("Max workers must be at least 1")
3844
3845 if args.memory_threshold is not None:
3846 args.memory_threshold = int(args.memory_threshold)
3847 if not (1 <= args.memory_threshold <= 99):
3848 raise ConfigError("Memory threshold must be between 1 and 99 percent")
3849 except ValueError:
3850 raise ConfigError("Numeric parameters must be valid integers")
3851
3852 # Create output directory
3853 output_dir = Path(args.output_dir if args.output_dir else DEFAULT_OUTPUT_DIR)
3854
3855 # Validate output directory
3856 if not output_dir.parent.exists():
3857 raise ConfigError(f"Parent directory does not exist: {output_dir.parent}")
3858
3859 # Create database configuration if needed
3860 if args.format == 'postgresql' or args.format == 'all':
3861 db_config = DatabaseConfig(
3862 engine=args.db_engine,
3863 host=args.db_host,
3864 port=int(args.db_port),
3865 database=args.db_name,
3866 username=args.db_user,
3867 password=args.db_password,
3868 schema=args.db_schema,
3869 echo_sql=args.db_echo
3870 )
3871 else:
3872 db_config = DatabaseConfig()
3873
3874 # Handle date range if specified
3875 date_range = None
3876 if args.date_from and args.date_to:
3877 try:
3878 date_from = datetime.datetime.strptime(args.date_from, '%Y-%m-%d').date()
3879 date_to = datetime.datetime.strptime(args.date_to, '%Y-%m-%d').date()
3880 date_range = (date_from, date_to)
3881 except ValueError:
3882 raise ConfigError("Date range must be in YYYY-MM-DD format")
3883
3884 # Determine output format
3885 format_str = args.format.lower() if args.format else 'text'
3886 try:
3887 output_format = {
3888 'text': OutputFormat.TEXT,
3889 'html': OutputFormat.HTML,
3890 'markdown': OutputFormat.MARKDOWN,
3891 'json': OutputFormat.JSON,
3892 'postgresql': OutputFormat.POSTGRESQL,
3893 'all': OutputFormat.ALL
3894 }[format_str]
3895 except KeyError:
3896 raise ConfigError(f"Invalid output format: {format_str}")
3897
3898 # Build options object
3899 options = ExportOptions(
3900 output_dir=output_dir,
3901 format=output_format,
3902 anonymize=args.anonymize,
3903 include_timestamps=not args.no_timestamps,
3904 use_local_time=not args.utc,
3905 include_metadata=args.include_metadata,
3906 include_message_ids=args.include_ids,
3907 parallel=not args.no_parallel,
3908 max_workers=args.max_workers or max(1, os.cpu_count() or 4),
3909 batch_size=args.batch_size or 1000,
3910 timezone=args.timezone,
3911 pretty_print=not args.no_pretty,
3912 compress_output=args.compress,
3913 filter_pattern=args.filter,
3914 date_range=date_range,
3915 include_conversation_stats=not args.no_stats,
3916 media_links=args.media_links,
3917 strip_html=not args.include_html,
3918 debug_mode=args.debug,
3919 basic_mode=args.basic,
3920 enable_memory_optimization=not args.no_memory_opt,
3921 memory_profile=args.memory_profile,
3922 memory_threshold_percent=args.memory_threshold or 75,
3923 database_config=db_config
3924 )
3925
3926 return options
3927
3928 def _setup_signal_handlers(self) -> None:
3929 """Set up handlers for system signals."""
3930 # Handle SIGINT (Ctrl+C)
3931 if hasattr(signal, 'SIGINT'):
3932 signal.signal(signal.SIGINT, self._signal_handler)
3933
3934 # Handle SIGTERM
3935 if hasattr(signal, 'SIGTERM'):
3936 signal.signal(signal.SIGTERM, self._signal_handler)
3937
3938 def _signal_handler(self, sig, frame) -> None:
3939 """
3940 Handle system signals to allow graceful shutdown.
3941
3942 Args:
3943 sig: Signal number
3944 frame: Current stack frame
3945 """
3946 self.ctx.logger.info(f"Received signal {sig}, shutting down gracefully...")
3947 self.ctx.cancel_requested = True
3948
3949 async def run(self) -> int:
3950 """
3951 Run the application.
3952
3953 Returns:
3954 Exit code (0 for success, non-zero for error)
3955 """
3956 try:
3957 # Run in basic mode if requested
3958 if self.ctx.options.basic_mode:
3959 input_path = Path(self.args.filename)
3960 if not input_path.exists():
3961 print(f"Error: Input file not found: {input_path}")
3962 return 1
3963
3964 return await self.basic_mode_handler.run(input_path)
3965
3966 # Standard advanced mode
3967 # Display welcome message
3968 self.ui.display_welcome()
3969
3970 # Check dependencies
3971 dependency_status = check_dependencies()
3972 missing_deps = [pkg for pkg, status in dependency_status.items() if not status]
3973
3974 if missing_deps:
3975 self.ctx.logger.warning(f"Missing dependencies: {', '.join(missing_deps)}")
3976
3977 # Try to install missing dependencies
3978 if self.ctx.options.format != OutputFormat.TEXT:
3979 # Check if required deps for the selected format are missing
3980 format_deps = {
3981 OutputFormat.HTML: ['jinja2'],
3982 OutputFormat.MARKDOWN: ['markdown'],
3983 OutputFormat.POSTGRESQL: ['sqlalchemy', 'psycopg2-binary']
3984 }
3985
3986 required_for_format = format_deps.get(self.ctx.options.format, [])
3987 missing_required = [d for d in required_for_format if d in missing_deps]
3988
3989 if missing_required:
3990 self.ctx.logger.info("Attempting to install missing dependencies required for "
3991 f"{self.ctx.options.format.name} format...")
3992 install_dependencies()
3993
3994 # Get user display name
3995 self.ctx.user_display_name = await self.ui.get_user_display_name()
3996
3997 # Process input file
3998 input_path = Path(self.args.filename)
3999 if not input_path.exists():
4000 self.ctx.logger.error(f"Input file not found: {input_path}")
4001 return 1
4002
4003 # Create appropriate reader and read file
4004 reader = FileReader.create_reader(input_path)
4005 raw_data = await reader.read(input_path, self.ctx)
4006
4007 # Parse the export data
4008 parser = SkypeExportParser(self.ctx)
4009 skype_export = await parser.parse(raw_data)
4010
4011 # Select conversations to export
4012 selected_conversations = None
4013 if self.args.choose:
4014 selected_conversations = await self.selector.select_conversations(skype_export)
4015
4016 if not selected_conversations:
4017 self.ctx.logger.warning("No conversations selected, nothing to export")
4018 return 0
4019
4020 # Export selected conversations
4021 export_manager = ExportManager(self.ctx)
4022 exported_files = await export_manager.export_conversations(
4023 skype_export, selected_conversations
4024 )
4025
4026 # Display summary
4027 self.ui.display_summary(skype_export, exported_files)
4028
4029 return 0
4030
4031 except Exception as e:
4032 if self.ctx.options.basic_mode:
4033 print(f"Error: {e}")
4034 else:
4035 self.ctx.logger.error(f"Error: {e}")
4036
4037 if self.ctx.options.debug_mode:
4038 if RICH_AVAILABLE:
4039 self.ctx.console.print_exception()
4040 else:
4041 self.ctx.logger.error(traceback.format_exc())
4042 return 1
4043
4044def main() -> int:
4045 """
4046 Main entry point for the application.
4047
4048 Returns:
4049 Exit code
4050 """
4051 app = SkypeExporterApp()
4052
4053 # Get the event loop
4054 try:
4055 loop = asyncio.get_event_loop()
4056 except RuntimeError:
4057 # Create new event loop if none exists
4058 loop = asyncio.new_event_loop()
4059 asyncio.set_event_loop(loop)
4060
4061 # Run the application
4062 try:
4063 return loop.run_until_complete(app.run())
4064 except KeyboardInterrupt:
4065 print("\nOperation cancelled by user")
4066 return 130 # Standard exit code for SIGINT
4067 finally:
4068 # Clean up
4069 loop.close()
4070
4071if __name__ == "__main__":
4072 sys.exit(main())