@@ -8,7 +8,8 @@ use std::sync::{
88} ;
99use tracing_subscriber:: { layer:: SubscriberExt , util:: SubscriberInitExt , EnvFilter , Layer } ;
1010use trieve_server:: operators:: {
11- dataset_operator:: get_dataset_by_id_query, user_operator:: hash_function,
11+ dataset_operator:: get_dataset_by_id_query, parse_operator:: extract_text_from_html,
12+ user_operator:: hash_function,
1213} ;
1314use trieve_server:: {
1415 data:: models:: { CrawlRequest , DatasetConfiguration , RedisPool } ,
@@ -124,6 +125,10 @@ async fn crawl(
124125 semantic_boost_phrase. push_str ( & page_description) ;
125126 }
126127
128+ let cleaned_html = extract_text_from_html ( & chunk_html. clone ( ) )
129+ . trim ( )
130+ . replace ( "\n " , "" ) ;
131+
127132 let chunk = ChunkReqPayload {
128133 chunk_html : Some ( chunk_html. clone ( ) ) ,
129134 link : Some ( page_link. clone ( ) ) ,
@@ -133,7 +138,11 @@ async fn crawl(
133138 "description" : page_description. clone( ) ,
134139 "url" : page_link. clone( ) ,
135140 } ) ) ,
136- tracking_id : Some ( hash_function ( & chunk_html. clone ( ) ) ) ,
141+ tracking_id : Some ( hash_function ( & format ! (
142+ "{}{}" ,
143+ page_link. clone( ) ,
144+ cleaned_html. clone( )
145+ ) ) ) ,
137146 upsert_by_tracking_id : Some ( true ) ,
138147 group_tracking_ids : Some ( vec ! [ page_link. clone( ) ] ) ,
139148 fulltext_boost : Some ( FullTextBoost {
0 commit comments