|
| 1 | +#!/bin/bash |
| 2 | +# Tiered Memory Recovery Test Script |
| 3 | +# |
| 4 | +# This script tests system recovery when storage consumption exceeds physical memory. |
| 5 | +# It uses a small physical memory limit (256MB) and imports a larger dataset. |
| 6 | +# The embedding index is disabled to focus on storage/tiered memory behavior. |
| 7 | +# |
| 8 | +# Usage: ./scripts/test_tiered_memory_recovery.sh [max_entities] |
| 9 | +# max_entities: Number of entities to import (default: 50000 - should exceed 256MB) |
| 10 | + |
# Abort on any error (-e), on use of an unset variable (-u), and propagate
# failures from any stage of a pipeline (-o pipefail). Without pipefail the
# `... | tee` pipelines below would report tee's status, hiding CLI failures.
set -euo pipefail

# Configuration (WIKIDATA_FILE and max_entities are overridable by the caller)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"   # directory holding this script
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"                  # repository root
WIKIDATA_FILE="${WIKIDATA_FILE:-/mnt/micron/wikidata20160104.json}"
MAX_ENTITIES="${1:-50000}" # Default increased to ensure we exceed 256MB
readonly STORAGE_BASE="/tmp/tiered_memory_test"
readonly IMPORT_CONFIG_FILE="$STORAGE_BASE/import_config.yaml"
readonly RECOVERY_CONFIG_FILE="$STORAGE_BASE/recovery_config.yaml"
readonly PORT=7201 # Different port to avoid conflict
readonly VERIFICATION_LOG="$STORAGE_BASE/verification.log"

# ANSI color escape sequences for output (interpreted via echo -e / printf %b)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # No Color
| 31 | + |
# Print an informational message prefixed with a blue [INFO] tag.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
| 35 | + |
# Print a success message prefixed with a green [SUCCESS] tag.
log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
| 39 | + |
# Print a warning message prefixed with a yellow [WARN] tag.
log_warn() {
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
| 43 | + |
# Print an error message prefixed with a red [ERROR] tag.
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
| 47 | + |
# Print a prominent cyan banner announcing a test phase: a blank line,
# a horizontal rule, the title, another rule, and a trailing blank line.
log_phase() {
    local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    printf '\n'
    printf '%b\n' "${CYAN}${rule}${NC}"
    printf '%b\n' "${CYAN} $1${NC}"
    printf '%b\n' "${CYAN}${rule}${NC}"
    printf '\n'
}
| 55 | + |
| 56 | +# Check prerequisites |
# Verify the test can run: the wikidata dump must exist and cargo must be
# installed. A missing debug binary only produces a warning, because the
# user may still build it (or fall back to cargo) before the import phase.
check_prerequisites() {
    log_info "Checking prerequisites..."

    if [[ ! -f "$WIKIDATA_FILE" ]]; then
        log_error "Wikidata dump file not found: $WIKIDATA_FILE"
        log_info "Set WIKIDATA_FILE environment variable to the correct path"
        exit 1
    fi

    if ! command -v cargo &> /dev/null; then
        log_error "cargo not found. Please install Rust."
        exit 1
    fi

    # Check for the binary (Debug build); warn only, do not abort.
    local cli_binary="$PROJECT_DIR/target/debug/wikidata_cli"
    if [[ ! -f "$cli_binary" ]]; then
        log_warn "Debug binary not found at $cli_binary"
        log_info "Please run: cargo build --bin wikidata_cli --no-default-features --features cpu_optimized"
        # We don't exit here, allowing the script to try using cargo run if needed or user to build
    fi

    log_success "Prerequisites check passed"
}
| 80 | + |
| 81 | +# Clean up previous test data |
# Remove any data from a previous run and recreate the storage directory tree.
# `${STORAGE_BASE:?}` aborts rather than letting an empty variable turn the
# command into `rm -rf /` territory; `--` guards against option-like names.
cleanup() {
    log_info "Cleaning up previous test data..."
    rm -rf -- "${STORAGE_BASE:?STORAGE_BASE must be set}"
    mkdir -p "$STORAGE_BASE"/{backup,wal,raft,undo}
    log_success "Cleanup complete"
}
| 91 | + |
| 92 | +# Create configuration files with tight memory limit |
# Write the import and recovery YAML configuration files. The two configs are
# identical except for `enable_recovery`, so both are emitted by one helper
# instead of two duplicated heredocs that could silently drift apart.
create_configs() {
    log_info "Creating configuration files with 256MB physical memory limit..."

    _write_node_config "$IMPORT_CONFIG_FILE" false    # fresh import, no recovery
    _write_node_config "$RECOVERY_CONFIG_FILE" true   # simulated restart, recover WAL

    log_success "Configuration files created"
}

# Emit one node config.
# Arguments: $1 - destination file path
#            $2 - value for enable_recovery (true|false)
_write_node_config() {
    local dest=$1
    local enable_recovery=$2
    cat > "$dest" << EOF
server_addr: 127.0.0.1:$PORT
group_name: TieredMemoryTest
meta_members:
  - 127.0.0.1:$PORT
storage:
  chunk_count: 16
  total_size: "64GB"
  backup_storage: "$STORAGE_BASE/backup"
  wal_storage: "$STORAGE_BASE/wal"
  raft_storage: "$STORAGE_BASE/raft"
  undo_log_storage: "$STORAGE_BASE/undo"
  enable_recovery: $enable_recovery
  services:
    - Cell
    - Transaction
    - RangedIndexer
    - Query
  index_enabled: true
  tiered_config:
    threshold: 0.8
    physical_memory_limit: "256MB"
EOF
}
| 148 | + |
| 149 | +# Phase 1: Import data |
# Phase 1: import the dataset through wikidata_cli under the 256MB memory cap.
# Output is mirrored to $STORAGE_BASE/import.log.
#
# BUG FIX: the original read `$?` after `... | tee`, which is tee's exit
# status, not the importer's — the failure branch could never fire. pipefail
# makes the pipeline report the CLI's status, and `|| exit_code=$?` keeps
# `set -e` from aborting before we can log a useful error message.
run_import() {
    log_phase "PHASE 1: Importing $MAX_ENTITIES entities (Memory Limit: 256MB)"

    cd "$PROJECT_DIR"

    log_info "Starting import with:"
    log_info "  • No Embedding Indexing (--no-embedding)"
    log_info "  • Batch size: 100"
    log_info "  • Workers: 4"
    log_info "  • Max entities: $MAX_ENTITIES"
    echo ""

    # Run the import
    # We use RUST_LOG to see memory usage logs if available, or just standard info
    local exit_code=0
    set -o pipefail
    RUST_LOG=morpheus::apps::wikidata=info,info "$PROJECT_DIR/target/debug/wikidata_cli" import \
        "$WIKIDATA_FILE" \
        --config "$IMPORT_CONFIG_FILE" \
        --max-entities "$MAX_ENTITIES" \
        --batch-size 100 \
        --workers 4 \
        --no-embedding 2>&1 | tee "$STORAGE_BASE/import.log" || exit_code=$?
    set +o pipefail

    if [[ $exit_code -ne 0 ]]; then
        log_error "Import failed with exit code $exit_code"
        exit 1
    fi

    log_success "Import phase completed successfully"

    # Give the write-ahead log a moment to flush before simulating a restart.
    log_info "Waiting for WAL flush..."
    sleep 5
}
| 184 | + |
| 185 | +# Phase 2: Recovery Verification |
# Phase 2: simulate a restart by starting against the recovery config and
# querying a known entity (Q42, assumed to be within the first 50k entities)
# to prove the imported data survived.
#
# BUG FIX: the original enabled pipefail under `set -e`, so a failing query
# pipeline terminated the script before the `$?` check ran — the error
# message was unreachable, and pipefail leaked into the rest of the script.
# `|| exit_code=$?` captures the status without tripping `set -e`, and
# pipefail is switched off again afterwards.
run_recovery_test() {
    log_phase "PHASE 2: Testing Recovery (Simulating restart)"

    cd "$PROJECT_DIR"

    log_info "Starting recovery verification..."
    log_info "Attempting to query entities from the recovered store..."

    # Enable pipefail so the pipeline reports the CLI's status, not tee's.
    local exit_code=0
    set -o pipefail
    RUST_LOG=info "$PROJECT_DIR/target/debug/wikidata_cli" query \
        --config "$RECOVERY_CONFIG_FILE" \
        entity --entity-id "Q42" 2>&1 | tee -a "$VERIFICATION_LOG" || exit_code=$?
    set +o pipefail

    if [[ $exit_code -ne 0 ]]; then
        log_error "Recovery verification failed! Server might have failed to start or find data."
        exit 1
    fi

    log_success "Recovery verification successful!"
}
| 221 | + |
| 222 | +# Main execution flow |
# Main execution flow: check environment, reset state, write configs,
# import under the memory cap, then verify recovery after a restart.
main() {
    check_prerequisites
    cleanup
    create_configs
    run_import
    run_recovery_test
}

# Forward script arguments so main (and anything it calls) can see them.
main "$@"