4949 # Skip raw→Sv stages; use existing Sv + denoised zarrs from output container
5050 python build_full_survey.py --skip-raw
5151
52+ # Re-denoise from existing Sv zarrs (skip raw→Sv, re-run denoise→products)
53+ python build_full_survey.py --skip-sv
54+
5255 # 38 kHz only (skip 200 kHz)
5356 python build_full_survey.py --freq 38
5457
@@ -1593,9 +1596,12 @@ def plot_combined_echogram(
15931596
15941597
15951598# ---------------------------------------------------------------------------
1596- # List existing denoised zarrs (for --skip-denoise / --resume)
1599+ # List existing Sv / denoised zarrs
15971600# ---------------------------------------------------------------------------
15981601
# Matches a raw Sv store name of the form "<YYYY-MM-DD>--<category>.zarr".
# Groups: (1) day key, (2) category.  Because ``\w`` does not match "-",
# names carrying a further "--suffix" (e.g. "--denoised") cannot match.
_SV_RE = re.compile(
    r"(\d{4}-\d{2}-\d{2})--(\w+)\.zarr$"
)
# Matches a denoised store name "<YYYY-MM-DD>--<category>--denoised.zarr".
# Groups: (1) day key, (2) category.
_DENOISED_RE = re.compile(
    r"(\d{4}-\d{2}-\d{2})--(\w+)--denoised\.zarr$"
)
@@ -1604,6 +1610,72 @@ def plot_combined_echogram(
16041610)
16051611
16061612
def _list_sv_local(local_root: Path) -> list[tuple[str, str, str]]:
    """Collect raw Sv zarr directories found under *local_root*.

    Only sub-directories of *local_root* named ``YYYY-MM-DD`` are searched.
    Inside each, directories whose name matches ``_SV_RE`` are reported;
    names containing ``--denoised``, ``--mvbs`` or ``--nasc`` are ignored.

    Returns:
        Sorted list of ``(day_key, category, zarr_path)`` where ``zarr_path``
        is ``"{day_key}/{directory name}"``.
    """
    derived_markers = ("--denoised", "--mvbs", "--nasc")
    found: list[tuple[str, str, str]] = []

    for day_dir in sorted(local_root.iterdir()):
        if not (day_dir.is_dir() and re.match(r"\d{4}-\d{2}-\d{2}$", day_dir.name)):
            continue
        for entry in day_dir.iterdir():
            name = entry.name
            # Keep {day}--{category}.zarr, drop derived products such as
            # {day}--{category}--denoised.zarr.
            if any(marker in name for marker in derived_markers):
                continue
            matched = _SV_RE.match(name)
            if not matched or not entry.is_dir():
                continue
            day_key, category = matched.group(1), matched.group(2)
            found.append((day_key, category, f"{day_key}/{name}"))

    return sorted(found)
1631+
1632+
def list_sv_zarrs(container: str) -> list[tuple[str, str, str]]:
    """List raw Sv zarr paths for *container* from local disk or Azure.

    The local output tree is preferred: when ``local_storage`` can be
    imported and ``_OUTPUT_ROOT / container`` exists, that directory is
    scanned and Azure is not contacted.  Otherwise the blob container named
    *container* is listed.

    Args:
        container: Output container name; also used as the sub-directory
            name under the local output root.

    Returns:
        Sorted list of ``(day_key, category, zarr_path)``.  Only raw Sv
        zarrs are returned, not denoised/mvbs/nasc variants.
    """
    # Try local disk first.  A missing ``local_storage`` module simply means
    # there is no local mirror, so that ImportError is deliberately ignored.
    # Only the import sits inside the ``try`` so that an ImportError raised
    # by anything else is not silently swallowed.
    try:
        from local_storage import _OUTPUT_ROOT
    except ImportError:
        pass
    else:
        local_root = _OUTPUT_ROOT / container
        if local_root.exists():
            return _list_sv_local(local_root)

    # Fall back to Azure Blob
    from azure.storage.blob import ContainerClient

    conn_str = _connection_string()

    # Match {day}/{day}--{category}.zarr/<metadata file>.  A store is detected
    # through its top-level metadata blob, since blob storage has no real
    # directories to test for.
    pattern = re.compile(
        r"(\d{4}-\d{2}-\d{2})/\d{4}-\d{2}-\d{2}--(\w+)\.zarr/(zarr\.json|\.zmetadata|\.zattrs)$"
    )
    derived_markers = ("--denoised", "--mvbs", "--nasc")

    results: list[tuple[str, str, str]] = []
    seen: set[str] = set()

    # Use the client as a context manager so its underlying connections are
    # closed even if the listing raises part-way through.
    with ContainerClient.from_connection_string(conn_str, container) as client:
        # NOTE(review): the listing prefix is hard-coded to 2023, whereas the
        # local scan accepts any year — confirm this is intended.
        for blob in client.list_blobs(name_starts_with="2023-"):
            name = blob.name
            # Skip denoised / mvbs / nasc
            if any(marker in name for marker in derived_markers):
                continue
            m = pattern.search(name)
            if not m:
                continue
            day_key = m.group(1)
            category = m.group(2)
            # A store may expose several metadata blobs; report it once.
            key = f"{day_key}/{category}"
            if key in seen:
                continue
            seen.add(key)
            # Strip the metadata file name to get the store path.
            results.append((day_key, category, name.rsplit("/", 1)[0]))

    results.sort()
    return results
1677+
1678+
16071679def _list_denoised_local (local_root : Path ) -> list [tuple [str , str , str ]]:
16081680 """Scan local disk for denoised zarr directories."""
16091681 results : list [tuple [str , str , str ]] = []
@@ -2442,29 +2514,33 @@ def run_full_pipeline(args: argparse.Namespace) -> None:
24422514 sys .exit (1 )
24432515
24442516 # ── Stage 1: Discover raw EK80 files ────────────────────────
2445- log .info ("=" * 70 )
2446- log .info ("STAGE 1: Discover raw EK80 files" )
2447- log .info ("=" * 70 )
2448- t0 = time .time ()
2517+ day_files : dict = {}
2518+ if not args .skip_sv and not args .skip_raw :
2519+ log .info ("=" * 70 )
2520+ log .info ("STAGE 1: Discover raw EK80 files" )
2521+ log .info ("=" * 70 )
2522+ t0 = time .time ()
24492523
2450- raw_files = discover_raw_files (
2451- start_date , end_date ,
2452- file_share_name = args .file_share ,
2453- file_share_path = args .file_share_path ,
2454- )
2455- if not raw_files :
2456- log .error ("No raw files found — check file share" )
2457- sys .exit (1 )
2524+ raw_files = discover_raw_files (
2525+ start_date , end_date ,
2526+ file_share_name = args .file_share ,
2527+ file_share_path = args .file_share_path ,
2528+ )
2529+ if not raw_files :
2530+ log .error ("No raw files found — check file share" )
2531+ sys .exit (1 )
24582532
2459- day_files = group_raw_by_day (raw_files )
2460- log .info (
2461- "Stage 1 complete: %d raw files across %d days (%.1fs)" ,
2462- len (raw_files ), len (day_files ), time .time () - t0 ,
2463- )
2533+ day_files = group_raw_by_day (raw_files )
2534+ log .info (
2535+ "Stage 1 complete: %d raw files across %d days (%.1fs)" ,
2536+ len (raw_files ), len (day_files ), time .time () - t0 ,
2537+ )
2538+ else :
2539+ log .info ("Skipping Stage 1 (raw file discovery) — using existing zarrs" )
24642540
24652541 # ── Stage 2: Download GPS GeoParquet ────────────────────────
24662542 gps_df = None
2467- if not args .skip_gps :
2543+ if not args .skip_gps and not args . skip_sv :
24682544 log .info ("=" * 70 )
24692545 log .info ("STAGE 2: Download GPS GeoParquet" )
24702546 log .info ("=" * 70 )
@@ -2514,6 +2590,32 @@ def run_full_pipeline(args: argparse.Namespace) -> None:
25142590 day_denoised_zarrs .setdefault (day_key , {})[category ] = zarr_path
25152591 sv_path = f"{ day_key } /{ day_key } --{ category } .zarr"
25162592 day_sv_zarrs .setdefault (day_key , {})[category ] = sv_path
2593+ elif args .skip_sv :
2594+ log .info ("--skip-sv: loading existing Sv zarrs, re-running denoise → products" )
2595+ sv_entries = list_sv_zarrs (output_container )
2596+ if start_date or end_date :
2597+ sv_entries = [
2598+ (d , c , p ) for d , c , p in sv_entries
2599+ if (not start_date or datetime .fromisoformat (d ) >= start_date )
2600+ and (not end_date or datetime .fromisoformat (d ) <= end_date )
2601+ ]
2602+ log .info (" Found %d Sv zarrs to re-denoise" , len (sv_entries ))
2603+ for day_key , category , sv_path in sv_entries :
2604+ day_sv_zarrs .setdefault (day_key , {})[category ] = sv_path
2605+
2606+ # Re-denoise each Sv zarr
2607+ for day_key , categories in sorted (day_sv_zarrs .items ()):
2608+ for category , sv_path in categories .items ():
2609+ try :
2610+ denoised_path = denoise_day_zarr (
2611+ sv_path , output_container , day_key , category ,
2612+ )
2613+ day_denoised_zarrs .setdefault (day_key , {})[category ] = denoised_path
2614+ all_denoised .append ((day_key , category , denoised_path ))
2615+ except Exception as e :
2616+ log .error (" Denoise failed %s/%s: %s" , day_key , category , e )
2617+ _release_memory ()
2618+ log .info (" Re-denoised %d zarrs across %d days" , len (all_denoised ), len (day_denoised_zarrs ))
25172619 else :
25182620 # Filter days for --resume
25192621 days_to_process = []
@@ -2861,6 +2963,10 @@ def main() -> None:
28612963 "--skip-raw" , action = "store_true" ,
28622964 help = "Skip raw→Sv stages; use existing Sv + denoised zarrs from output container." ,
28632965 )
2966+ parser .add_argument (
2967+ "--skip-sv" , action = "store_true" ,
2968+ help = "Skip raw→Sv stages; re-run denoising from existing Sv zarrs, then MVBS/NASC/echograms." ,
2969+ )
28642970 parser .add_argument (
28652971 "--skip-gps" , action = "store_true" ,
28662972 help = "Skip GPS GeoParquet download." ,
0 commit comments