mikopbx
diff --git a/‎src/Common/Providers/CLAUDE.md‎
Lines changed: 2 additions & 2 deletions b/‎src/Common/Providers/CLAUDE.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/Common/Providers/ManagedCacheProvider.php‎
Lines changed: 16 additions & 6 deletions b/‎src/Common/Providers/ManagedCacheProvider.php‎
Lines changed: 16 additions & 6 deletions
diff --git a/‎src/Common/Providers/RedisClientProvider.php‎
Lines changed: 68 additions & 8 deletions b/‎src/Common/Providers/RedisClientProvider.php‎
Lines changed: 68 additions & 8 deletions
diff --git a/‎src/Core/System/Configs/RedisConf.php‎
Lines changed: 32 additions & 2 deletions b/‎src/Core/System/Configs/RedisConf.php‎
Lines changed: 32 additions & 2 deletions
diff --git a/‎src/Core/Workers/WorkerBase.php‎
Lines changed: 100 additions & 6 deletions b/‎src/Core/Workers/WorkerBase.php‎
Lines changed: 100 additions & 6 deletions
diff --git a/‎src/Core/Workers/WorkerModelsEvents.php‎
Lines changed: 13 additions & 3 deletions b/‎src/Core/Workers/WorkerModelsEvents.php‎
Lines changed: 13 additions & 3 deletions
@@ -58,7 +58,7 @@ Providers/
 | LoggerAuthProvider | `loggerAuth` | Yes | Syslog AUTH | Auth audit trail |
 | LoggerProvider | `logger` | Yes | Syslog | UDP 127.0.0.1:514 |
 | MainDatabaseProvider | `db` | Yes | SQLite | mikopbx.db |
-| ManagedCacheProvider | `managedCache` | No | Redis DB4 | 1h TTL |
+| ManagedCacheProvider | `managedCache` | Yes | Redis DB4 | 1h TTL, shared singleton per process (issue #1022) |
 | MarketPlaceProvider | `license` | Yes | License class | Module marketplace |
 | MessagesProvider | `messages` | Yes | Files/Cache | 29 languages |
 | ModelsAnnotationsProvider | `annotations` | Yes | Memory | Model annotations |
@@ -69,7 +69,7 @@ Providers/
 | PBXConfModulesProvider | `pbxConfModules` | Yes | DB | Module hooks + priority |
 | PBXCoreRESTClientProvider | `restAPIClient` | No | HTTP | GuzzleHttp, 30s timeout |
 | RecordingStorageDatabaseProvider | `dbRecordingStorage` | Yes | SQLite | recording_storage.db |
-| RedisClientProvider | `redis` | No | Redis DB1 | Worker IPC |
+| RedisClientProvider | `redis` | Yes | Redis DB1 | Worker IPC, shared singleton per process (issue #1022) |
 | RegistryProvider | `registry` | Yes | Memory | Global state |
 | RouterProvider | `router` | No | Config | Module route integration |
 | SentryErrorHandlerProvider | `sentryErrorHandler` | No | Sentry API | Production error tracking |
 
@@ -22,9 +22,9 @@
 
 namespace MikoPBX\Common\Providers;
 
+use Phalcon\Cache\Adapter\Redis as CacheAdapterRedis;
 use Phalcon\Di\DiInterface;
 use Phalcon\Di\ServiceProviderInterface;
-use Phalcon\Cache\Adapter\Redis as CacheAdapterRedis;
 use Phalcon\Storage\SerializerFactory;
 
 /**
@@ -45,20 +45,30 @@ class ManagedCacheProvider implements ServiceProviderInterface
     public function register(DiInterface $di): void
     {
         $config = $di->getShared(ConfigProvider::SERVICE_NAME);
-        $di->set(
+        // Shared (singleton) per process — see issue #1022. Before this change
+        // the provider was registered with $di->set(), so every di->get() (and
+        // some workers call it every 5s inside BLPOP loops) created a fresh
+        // TCP socket. Those sockets accumulated until phpredis/kernel ran out
+        // of descriptors and the whole worker pool lost Redis simultaneously.
+        $di->setShared(
             self::SERVICE_NAME,
             function () use ($config) {
                 $serializerFactory = new SerializerFactory();
-
                 $options = [
                     'lifetime'          => 3600,
                     'host'              => $config->path('redis.host'),
                     'port'              => $config->path('redis.port'),
                     'index'             => self::DATABASE_INDEX,
-                    'prefix'            => self::CACHE_PREFIX
+                    'prefix'            => self::CACHE_PREFIX,
+                    'persistent'        => false,
                 ];
-
-                return new CacheAdapterRedis($serializerFactory, $options);
+                $cacheAdapter = new CacheAdapterRedis($serializerFactory, $options);
+                // Prime the underlying phpredis socket so OPT_TCP_KEEPALIVE /
+                // OPT_READ_TIMEOUT apply on a live connection. The wrapper
+                // keeps its reference to the same \Redis, so later cache
+                // operations benefit from these options transparently.
+                RedisClientProvider::primeRedisAdapter($cacheAdapter->getAdapter());
+                return $cacheAdapter;
             }
         );
     }
 
@@ -25,6 +25,7 @@
 use Phalcon\Di\ServiceProviderInterface;
 use Phalcon\Storage\Adapter\Redis as AdapterRedis;
 use Phalcon\Storage\SerializerFactory;
+use Throwable;
 
 /**
  * The RedisClientProvider class is responsible for registering the Redis client service.
@@ -38,31 +39,90 @@ class RedisClientProvider implements ServiceProviderInterface
     public const string CACHE_PREFIX = '_PH_REDIS_CLIENT:';
     public const int DATABASE_INDEX = 1;
 
+    /**
+     * phpredis OPT_READ_TIMEOUT (seconds). Controls how long a blocking read
+     * (e.g. BLPOP) waits on a silent socket before raising RedisException.
+     * This is THE primary defence against the "every 10-30 minutes" cadence
+     * in issue #1022: without it phpredis inherits the kernel TCP timeout
+     * (~15 min), so workers sit on half-dead sockets for the whole TCP
+     * retransmission window instead of surfacing the failure.
+     */
+    public const int READ_TIMEOUT = 10;
+
+    /**
+     * phpredis OPT_TCP_KEEPALIVE — in this build it is a 0/1 enable flag
+     * (phpredis 6.x normalises any positive value to 1). We enable SO_KEEPALIVE
+     * on the socket; the actual probe timing is taken from kernel sysctls
+     * (net.ipv4.tcp_keepalive_time / _intvl / _probes). On MikoPBX the kernel
+     * defaults are 7200 s / 75 s / 9 probes which is slower than we would
+     * like, but combined with OPT_READ_TIMEOUT above the client still breaks
+     * out of blocked reads within 10 s. The keepalive flag is kept as a
+     * belt-and-braces cleanup for truly-idle connections.
+     */
+    public const int TCP_KEEPALIVE_ENABLED = 1;
+
+    /**
+     * Prime a freshly created phpredis object so issue #1022 protections take
+     * effect: ping() to force the connect (Phalcon's Storage\Adapter\Redis
+     * lazy-opens its socket on the first command), then set
+     * {@see \Redis::OPT_TCP_KEEPALIVE} and {@see \Redis::OPT_READ_TIMEOUT}.
+     *
+     * Any failure is swallowed intentionally — the caller will see the
+     * RedisException on its next real operation and handle it there.
+     * Returns the same $redis instance that was passed in.
+     */
+    public static function primeRedisAdapter(\Redis $redis): \Redis
+    {
+        try {
+            $redis->ping();
+        } catch (Throwable) {
+            // Connect failed; setOption() is still attempted below. Whether
+            // phpredis keeps options set on an unconnected object — TODO confirm.
+        }
+        try {
+            // NOTE(review): it is unverified here whether OPT_TCP_KEEPALIVE
+            // is applied to the already-open socket or only on the NEXT
+            // connect()/reconnect() — confirm against the phpredis source
+            // for the bundled version. Either way, OPT_READ_TIMEOUT below is
+            // the protection relied on for the current socket;
+            // OPT_TCP_KEEPALIVE is a belt-and-braces flag.
+            $redis->setOption(\Redis::OPT_TCP_KEEPALIVE, self::TCP_KEEPALIVE_ENABLED);
+            $redis->setOption(\Redis::OPT_READ_TIMEOUT, self::READ_TIMEOUT);
+        } catch (Throwable) {
+            // Some builds of phpredis refuse setOption before the first
+            // successful connect — not fatal; this method does not retry.
+        }
+        return $redis;
+    }
 
     /**
      * Register the Redis client service provider.
      *
      * @param DiInterface $di The DI container.
      */
     public function register(DiInterface $di): void
-    {  
+    {
         $config = $di->getShared(ConfigProvider::SERVICE_NAME);
-        $di->set(
+        // Shared (singleton) per process: a worker or php-fpm child reuses
+        // one \Redis socket for its entire lifetime instead of opening a new
+        // TCP connection on every di->get() call inside the main loop.
+        // See issue #1022 root-cause analysis.
+        $di->setShared(
             self::SERVICE_NAME,
-            function () use ($di, $config       ) {
+            function () use ($config) {
                 $serializerFactory = new SerializerFactory();
-
                 $options = [
                     'defaultSerializer' => 'Php',
                     'lifetime'          => 3600,
                     'host'              => $config->path('redis.host'),
                     'port'              => $config->path('redis.port'),
                     'index'             => self::DATABASE_INDEX,
-                    'prefix'            => self::CACHE_PREFIX
+                    'prefix'            => self::CACHE_PREFIX,
+                    'persistent'        => false,
                 ];
-
-                return (new AdapterRedis($serializerFactory, $options))->getAdapter();
+                $adapter = new AdapterRedis($serializerFactory, $options);
+                return self::primeRedisAdapter($adapter->getAdapter());
             }
         );
     }
-} 
+}
@@ -127,12 +127,25 @@ public function generateMonitConf(): bool{
 
         $busyboxPath = Util::which('busybox');
         $confPath    = $this->getMainMonitConfFile();
-
+        $redisPort   = $this->port !== '' ? $this->port : '6379';
+
+        // Monit now also runs a TCP PING/PONG probe on the Redis port on top
+        // of the pidfile check. This catches frozen-but-alive Redis processes
+        // (blocked event loop, stuck I/O) that the PID-only check could not
+        // detect — see issue #1022. `for 3 cycles` at the 5 s daemon tick
+        // means the probe must fail for ~15 s before monit restarts Redis,
+        // which avoids flapping restarts on brief hiccups. The hard limit of
+        // 5 restarts per 10 cycles is a safety valve against restart loops.
         $conf = 'check process '.self::PROC_NAME.' with pidfile /var/run/redis.pid '.PHP_EOL.
             '    start program = "'.$this->startCommand.'"'.PHP_EOL.
             '        as uid root and gid root'.PHP_EOL.
             '    stop program = "'.$busyboxPath.' sh -c \''.$busyboxPath.' killall '.self::PROC_NAME.'; rm -f /var/run/redis.pid\''.'"'.PHP_EOL.
-            '        as uid root and gid root';
+            '        as uid root and gid root'.PHP_EOL.
+            '    if failed port '.$redisPort.' type tcp protocol default'.PHP_EOL.
+            '       send "PING\r\n" expect "PONG" timeout 5 seconds'.PHP_EOL.
+            '       for 3 cycles'.PHP_EOL.
+            '    then restart'.PHP_EOL.
+            '    if 5 restarts within 10 cycles then timeout';
 
         $this->saveFileContent($confPath, $conf);
         return true;
@@ -167,6 +180,23 @@ private function configure(): void
         # Prevents write-blocking when /var/tmp (tmpfs) overflows (issue #651).
         $conf  .= "save \"\"" . PHP_EOL;
         $conf  .= "stop-writes-on-bgsave-error no" . PHP_EOL;
+
+        # Client/memory caps (issue #1022).
+        #  - maxclients: hard ceiling so leaked or stale sockets cannot pile
+        #    up to the phpredis/Redis default limit and knock out all three
+        #    WorkerApiCommands instances simultaneously. 300 leaves ample
+        #    headroom for the usual 50-ish active clients (3 API workers +
+        #    php-fpm pool + module workers).
+        #  - maxmemory: small systems must not let Redis grow unbounded and
+        #    trigger OOM killer. 64 MB matches the current working set
+        #    (heartbeat keys + metadata cache + short-lived queues).
+        #  - maxmemory-policy allkeys-lru: evict the least recently used keys
+        #    instead of rejecting writes under pressure (the default
+        #    `noeviction` would otherwise make every `set` fail at the cap).
+        $conf  .= "maxclients 300" . PHP_EOL;
+        $conf  .= "maxmemory 64mb" . PHP_EOL;
+        $conf  .= "maxmemory-policy allkeys-lru" . PHP_EOL;
+
         file_put_contents(self::CONF_FILE, $conf);
     }
 
 
@@ -137,6 +137,17 @@ abstract class WorkerBase extends Injectable implements WorkerInterface
      */
     protected int $workerState = self::STATE_STARTING;
 
+    /**
+     * Redis client held by this worker. Can be a raw phpredis \Redis (from
+     * RedisClientProvider) or a Phalcon cache wrapper (from
+     * ManagedCacheProvider) depending on the subclass. Declared explicitly
+     * to avoid the PHP 8.2+ dynamic-property deprecation AND to stop
+     * Phalcon\Di\Injectable::__get() from silently resurrecting a fresh
+     * connection inside destructors or signal handlers — a real hazard
+     * raised in the #1022 code review.
+     */
+    protected mixed $redis = null;
+
 
 
     /**
@@ -185,6 +196,90 @@ public static function getCheckInterval(): int
         return self::KEEP_ALLIVE_CHECK_INTERVAL;
     }
 
+    /**
+     * Run a Redis operation with a short exponential backoff retry loop.
+     *
+     * Used to survive transient Redis glitches without dropping in-flight
+     * jobs. The backoff is intentionally short: 100 ms doubling per retry,
+     * i.e. 100 ms + 200 ms of sleep with the default three attempts —
+     * if Redis is down for longer, the outer main-loop catch handles the
+     * extended outage with a larger backoff and a single syslog marker
+     * (`reason=redis_unreachable_extended`) instead of spamming Sentry.
+     *
+     * Phalcon Storage\Adapter\Redis reconnects automatically through its
+     * `checkConnect()` on every operation, so we only need to retry and
+     * wait; there is no need to tear down the shared adapter here.
+     *
+     * Introduced for issue #1022.
+     *
+     * @template T
+     * @param callable(): T $op          Redis operation to execute.
+     * @param int           $maxAttempts Attempts including the first try.
+     * @return T
+     * @throws \RedisException|\Phalcon\Storage\Exception on terminal failure
+     * @throws \InvalidArgumentException when $maxAttempts is < 1
+     */
+    protected function withRedisRetry(callable $op, int $maxAttempts = 3): mixed
+    {
+        if ($maxAttempts < 1) {
+            throw new \InvalidArgumentException(
+                'withRedisRetry: $maxAttempts must be >= 1, got ' . $maxAttempts
+            );
+        }
+        $lastException = null;
+        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
+            try {
+                return $op();
+            } catch (\RedisException | \Phalcon\Storage\Exception $e) {
+                $lastException = $e;
+                if ($attempt === $maxAttempts) {
+                    break;
+                }
+                // 100 ms, 200 ms, 400 ms, … — geometric, each sleep capped
+                // at 2 s. The shift is clamped to prevent integer overflow
+                // when a subclass passes a pathologically large $maxAttempts.
+                $shift = min($attempt - 1, 20);
+                usleep(min((1 << $shift) * 100_000, 2_000_000));
+            }
+        }
+        throw $lastException;
+    }
+
+    /**
+     * Best-effort close of the phpredis socket held by this worker.
+     *
+     * Hoisted here (instead of WorkerRedisBase) so that every subclass —
+     * Beanstalk workers that touch ManagedCacheProvider, Redis-pool
+     * workers, and shutdown signal handlers — can release their socket
+     * without duplicating the wrapper-vs-raw detection logic.
+     *
+     * Safe to call multiple times and to call when Redis is unreachable:
+     * any error is swallowed so the shutdown path never blocks.
+     * Introduced for issue #1022 — prevents the "200+ stale Redis
+     * connections from terminated PHP workers" pattern documented in
+     * commit e2e191abb.
+     */
+    protected function closeRedis(): void
+    {
+        try {
+            if (isset($this->redis) && is_object($this->redis)) {
+                if (method_exists($this->redis, 'close')) {
+                    // phpredis \Redis::close() — hard close the socket.
+                    @$this->redis->close();
+                } elseif (method_exists($this->redis, 'getAdapter')) {
+                    // Phalcon cache wrapper — reach through to phpredis.
+                    $inner = $this->redis->getAdapter();
+                    if (is_object($inner) && method_exists($inner, 'close')) {
+                        @$inner->close();
+                    }
+                }
+            }
+        } catch (\Throwable) {
+            // Fully-qualified so the catch matches even without a `use Throwable;`
+            // import. Errors are ignored: this only releases the socket on exit.
+        }
+    }
+
     /**
      * Sets resource limits for the worker process
      */
@@ -377,12 +472,11 @@ public function signalHandler(int $signal): void
             case SIGINT:
                 $this->setWorkerState(self::STATE_STOPPING);
 
-                // Cleanup for Redis-based workers
-                if ($this instanceof WorkerRedisBase) {
-                    if ($this->redis) {
-                        $this->redis->close();
-                    }
-                }
+                // Release the Redis socket via the wrapper-aware helper
+                // (the raw `$this->redis->close()` path would bomb on a
+                // Phalcon cache wrapper now that providers can return one —
+                // see issue #1022 code review BLOCKER).
+                $this->closeRedis();
                 exit(0);
 
             default:
 
@@ -127,7 +127,15 @@ public static function getCheckInterval(): int
     }
 
     /**
-     * Save worker state to Redis
+     * Save worker state to Redis.
+     *
+     * Called from the shutdown path (handleShutdownSignal), so the retry
+     * matters: without it, losing the shutdown-time write means the
+     * queued reload actions are gone on the next restart. This was Sentry
+     * issue #27017 under #1022 — `Connection lost` in Redis::set during
+     * shutdown. The write goes through withRedisRetry() with its default
+     * attempt count, which is cheap insurance and survives both a stale-socket
+     * case (retry reopens) and a flapping Redis case (first retry waits 100 ms).
      */
     private function saveStateToRedis(): void
     {
@@ -138,9 +146,11 @@ private function saveStateToRedis(): void
                 'last_change' => $this->last_change,
                 'timestamp' => time()
             ];
-            
+
             $workerKey = self::REDIS_PREFIX . self::class . ':' . getmypid();
-            $this->managedCache->set($workerKey, $state, self::REDIS_TTL);
+            $this->withRedisRetry(
+                fn() => $this->managedCache->set($workerKey, $state, self::REDIS_TTL)
+            );
         } catch (Throwable $e) {
             CriticalErrorsHandler::handleExceptionWithSyslog($e);
         }