Skip to content

Commit 2601502

Browse files
committed
feat: add compound embedding search coverage
Use a compound aliases+description embedding index for Wikidata semantic search, add a compound embedding test and large-scale HNSW quality test, and tighten recovery script result expectations.
1 parent 1030881 commit 2601502

5 files changed

Lines changed: 396 additions & 57 deletions

File tree

scripts/test_wikidata_import_recovery.sh

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -424,7 +424,6 @@ EOF
424424
# Extract the result count
425425
local result_line
426426
result_line=$(echo "$output" | grep "Semantic search results" | head -1)
427-
echo -e "${GREEN} ✓ Query: '$query'${NC}"
428427
echo " $result_line"
429428

430429
# Show debug info if available
@@ -468,8 +467,13 @@ EOF
468467
echo " ... and $((result_count - 3)) more results"
469468
fi
470469
echo ""
471-
472-
successful_queries=$((successful_queries + 1))
470+
471+
if [[ $result_count -ge 6 ]]; then
472+
echo -e "${GREEN} ✓ Query: '$query' (>= 6 results)${NC}"
473+
successful_queries=$((successful_queries + 1))
474+
else
475+
echo -e "${YELLOW} ~ Query: '$query' - Only $result_count results (expected >= 6)${NC}"
476+
fi
473477

474478
elif echo "$output" | grep -qi "embedding.*not.*initialized\|index.*not.*found"; then
475479
echo -e "${RED} ✗ Query: '$query' - Embedding index not available${NC}"

src/apps/embedding/tests.rs

Lines changed: 228 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1295,6 +1295,234 @@ mod schema_based_tests {
12951295
println!("=== Multi-Index Schema Test PASSED ===");
12961296
}
12971297

1298+
/// Test compound embedding index (e.g., aliases + description in Wikidata)
1299+
#[tokio::test]
1300+
#[ignore]
1301+
async fn test_compound_embedding_index() {
1302+
let _ = env_logger::try_init();
1303+
1304+
println!("=== Compound Embedding Index Test ===\n");
1305+
1306+
const SCIENTIST_SCHEMA_ID: u32 = 10004;
1307+
const ID_FIELD: &str = "id";
1308+
const NAME_FIELD: &str = "name";
1309+
const ALIASES_FIELD: &str = "aliases";
1310+
const DESCRIPTION_FIELD: &str = "description";
1311+
const COMPOUND_NAME: &str = "aliases_description";
1312+
let compound_field_id = bifrost_hasher::hash_str(COMPOUND_NAME);
1313+
1314+
fn scientist_schema() -> Schema {
1315+
let embedding_model = EmbeddingModel::new("gguf:nomic-embed-text-v1.5");
1316+
let fields = vec![
1317+
Field::new_unindexed(ID_FIELD, Type::String),
1318+
Field::new_unindexed(NAME_FIELD, Type::String),
1319+
Field::new_unindexed_array(ALIASES_FIELD, Type::String),
1320+
Field::new_unindexed(DESCRIPTION_FIELD, Type::String),
1321+
];
1322+
let mut schema = Schema::new_with_id(
1323+
SCIENTIST_SCHEMA_ID,
1324+
"test_scientist",
1325+
None,
1326+
Field::new_schema(fields),
1327+
false,
1328+
false,
1329+
);
1330+
schema.add_compound_index(
1331+
COMPOUND_NAME,
1332+
vec![ALIASES_FIELD.to_string(), DESCRIPTION_FIELD.to_string()],
1333+
vec![IndexType::Embedding(embedding_model)],
1334+
);
1335+
schema
1336+
}
1337+
1338+
let server = start_server(6014, "compound_embedding_test")
1339+
.await
1340+
.unwrap();
1341+
let _partition = server.init_hnsw_index_partition_service().await.unwrap();
1342+
let _hnsw = server.init_hnsw_index_service().await.unwrap();
1343+
crate::server::MorpheusServer::init_embedding_service(&server)
1344+
.await
1345+
.unwrap();
1346+
1347+
let embedding_client = &server
1348+
.neb_server
1349+
.indexer
1350+
.as_ref()
1351+
.unwrap()
1352+
.clients
1353+
.embedding_client;
1354+
1355+
if !embedding_client.is_embedding_index_core_set() {
1356+
eprintln!("✗ Embedding core not set, skipping test");
1357+
return;
1358+
}
1359+
1360+
server
1361+
.neb_client
1362+
.new_schema_with_id(scientist_schema())
1363+
.await
1364+
.unwrap()
1365+
.unwrap();
1366+
println!("✓ Scientist schema with compound embedding registered (index auto-created)");
1367+
1368+
let scientists = vec![
1369+
(
1370+
"Albert Einstein",
1371+
vec!["Einstein", "Albert", "theoretical physicist"],
1372+
"developed the theory of relativity and contributed to quantum mechanics",
1373+
),
1374+
(
1375+
"Isaac Newton",
1376+
vec!["Newton", "Isaac", "mathematician", "physicist"],
1377+
"formulated laws of motion and universal gravitation",
1378+
),
1379+
(
1380+
"Marie Curie",
1381+
vec!["Curie", "Marie", "Skłodowska", "physicist", "chemist"],
1382+
"pioneering research on radioactivity and discovered polonium and radium",
1383+
),
1384+
];
1385+
1386+
let mut scientist_ids: Vec<(Id, &str)> = Vec::new();
1387+
1388+
println!("\nWriting scientists...");
1389+
for (name, aliases, description) in &scientists {
1390+
let id = Id::rand();
1391+
let mut map = OwnedMap::new();
1392+
map.insert(ID_FIELD, OwnedValue::String(name.to_string()));
1393+
map.insert(NAME_FIELD, OwnedValue::String(name.to_string()));
1394+
map.insert(
1395+
ALIASES_FIELD,
1396+
OwnedValue::PrimArray(dovahkiin::types::OwnedPrimArray::String(
1397+
aliases.iter().map(|s| s.to_string()).collect(),
1398+
)),
1399+
);
1400+
map.insert(
1401+
DESCRIPTION_FIELD,
1402+
OwnedValue::String(description.to_string()),
1403+
);
1404+
1405+
let cell = OwnedCell::new_with_id(SCIENTIST_SCHEMA_ID, &id, OwnedValue::Map(map));
1406+
server.neb_client.write_cell(cell).await.unwrap().unwrap();
1407+
1408+
println!(" ✓ {} written", name);
1409+
scientist_ids.push((id, name));
1410+
}
1411+
1412+
tokio::time::sleep(tokio::time::Duration::from_millis(1000)).await;
1413+
1414+
println!("\n=== Testing Compound Index Search ===\n");
1415+
1416+
let query1 = "physicist relativity";
1417+
println!("Query: \"{}\"", query1);
1418+
let results1 = embedding_client
1419+
.search(SCIENTIST_SCHEMA_ID, compound_field_id, query1, 3)
1420+
.await
1421+
.unwrap();
1422+
1423+
println!("Results:");
1424+
for (i, hit) in results1.iter().enumerate() {
1425+
let name = scientist_ids
1426+
.iter()
1427+
.find(|(id, _)| *id == hit.id)
1428+
.map(|(_, n)| *n)
1429+
.unwrap_or("unknown");
1430+
println!(" {}. {} (score: {:.4})", i + 1, name, hit.score);
1431+
}
1432+
1433+
let einstein_id = scientist_ids
1434+
.iter()
1435+
.find(|(_, n)| n.contains("Einstein"))
1436+
.unwrap()
1437+
.0;
1438+
assert_eq!(
1439+
results1[0].id, einstein_id,
1440+
"Einstein should be top for relativity query"
1441+
);
1442+
println!("✓ Correct: Einstein is top result\n");
1443+
1444+
let query2 = "mathematician gravity";
1445+
println!("Query: \"{}\"", query2);
1446+
let results2 = embedding_client
1447+
.search(SCIENTIST_SCHEMA_ID, compound_field_id, query2, 3)
1448+
.await
1449+
.unwrap();
1450+
1451+
println!("Results:");
1452+
for (i, hit) in results2.iter().enumerate() {
1453+
let name = scientist_ids
1454+
.iter()
1455+
.find(|(id, _)| *id == hit.id)
1456+
.map(|(_, n)| *n)
1457+
.unwrap_or("unknown");
1458+
println!(" {}. {} (score: {:.4})", i + 1, name, hit.score);
1459+
}
1460+
1461+
let newton_id = scientist_ids
1462+
.iter()
1463+
.find(|(_, n)| n.contains("Newton"))
1464+
.unwrap()
1465+
.0;
1466+
assert_eq!(
1467+
results2[0].id, newton_id,
1468+
"Newton should be top for gravity query"
1469+
);
1470+
println!("✓ Correct: Newton is top result\n");
1471+
1472+
let query3 = "chemist radioactivity";
1473+
println!("Query: \"{}\"", query3);
1474+
let results3 = embedding_client
1475+
.search(SCIENTIST_SCHEMA_ID, compound_field_id, query3, 3)
1476+
.await
1477+
.unwrap();
1478+
1479+
println!("Results:");
1480+
for (i, hit) in results3.iter().enumerate() {
1481+
let name = scientist_ids
1482+
.iter()
1483+
.find(|(id, _)| *id == hit.id)
1484+
.map(|(_, n)| *n)
1485+
.unwrap_or("unknown");
1486+
println!(" {}. {} (score: {:.4})", i + 1, name, hit.score);
1487+
}
1488+
1489+
let curie_id = scientist_ids
1490+
.iter()
1491+
.find(|(_, n)| n.contains("Curie"))
1492+
.unwrap()
1493+
.0;
1494+
assert_eq!(
1495+
results3[0].id, curie_id,
1496+
"Curie should be top for radioactivity query"
1497+
);
1498+
println!("✓ Correct: Curie is top result\n");
1499+
1500+
println!("=== Testing Alias Contribution ===");
1501+
let query_alias = "Skłodowska";
1502+
println!("Query (alias only): \"{}\"", query_alias);
1503+
let results_alias = embedding_client
1504+
.search(SCIENTIST_SCHEMA_ID, compound_field_id, query_alias, 3)
1505+
.await
1506+
.unwrap();
1507+
1508+
println!("Results:");
1509+
for (i, hit) in results_alias.iter().enumerate() {
1510+
let name = scientist_ids
1511+
.iter()
1512+
.find(|(id, _)| *id == hit.id)
1513+
.map(|(_, n)| *n)
1514+
.unwrap_or("unknown");
1515+
println!(" {}. {} (score: {:.4})", i + 1, name, hit.score);
1516+
}
1517+
assert_eq!(
1518+
results_alias[0].id, curie_id,
1519+
"Curie should match her maiden name Skłodowska"
1520+
);
1521+
println!("✓ Correct: Alias contributes to matching\n");
1522+
1523+
println!("=== Compound Embedding Index Test PASSED ===");
1524+
}
1525+
12981526
/// Test embedding on array fields (like the aliases field in Wikidata)
12991527
#[tokio::test]
13001528
#[ignore]

0 commit comments

Comments
 (0)