diff --git a/readme.connections.md b/readme.connections.md
index b7f419b..604fd4b 100644
--- a/readme.connections.md
+++ b/readme.connections.md
@@ -548,4 +548,129 @@ Debug scripts confirmed:
 - The zombie detection successfully identifies and cleans up these connections
 - Both full zombies (both sockets destroyed) and half-zombies (one socket destroyed) are handled
 
-This fix addresses the specific issue where "connections that are closed on the inner proxy, always also close on the outer proxy" as requested by the user.
\ No newline at end of file
+This fix addresses the specific issue where "connections that are closed on the inner proxy, always also close on the outer proxy" as requested by the user.
+
+## 🔍 Production Diagnostics (January 2025)
+
+Since the zombie detection fix didn't fully resolve the issue, use the ProductionConnectionMonitor to diagnose the actual problem:
+
+### How to Use the Production Monitor
+
+1. **Add to your proxy startup script**:
+```typescript
+import ProductionConnectionMonitor from './production-connection-monitor.js';
+
+// After proxy.start()
+const monitor = new ProductionConnectionMonitor(proxy);
+monitor.start(5000); // Check every 5 seconds
+
+// Monitor will automatically capture diagnostics when:
+// - Connections exceed threshold (default: 50)
+// - Sudden spike occurs (default: +20 connections)
+```
+
+2. **Diagnostics are saved to**: `.nogit/connection-diagnostics/`
+
+3. **Force capture anytime**: `monitor.forceCaptureNow()`
+
+### What the Monitor Captures
+
+For each connection:
+- Socket states (destroyed, readable, writable, readyState)
+- Connection flags (closed, keepAlive, TLS status)
+- Data transfer statistics
+- Time since last activity
+- Cleanup queue status
+- Event listener counts
+- Termination reasons
+
+### Pattern Analysis
+
+The monitor automatically identifies:
+- **Zombie connections**: Both sockets destroyed but not cleaned up
+- **Half-zombies**: One socket destroyed
+- **Stuck connecting**: Outgoing socket stuck in connecting state
+- **No outgoing**: Missing outgoing socket
+- **Keep-alive stuck**: Keep-alive connections with no recent activity
+- **Old connections**: Connections older than 1 hour
+- **No data transfer**: Connections with no bytes transferred
+- **Listener leaks**: Excessive event listeners
+
+### Common Accumulation Patterns
+
+1. **Connecting State Stuck**
+   - Outgoing socket shows `connecting: true` indefinitely
+   - Usually means connection timeout not working
+   - Check if backend is reachable
+
+2. **Missing Outgoing Socket**
+   - Connection has no outgoing socket but isn't closed
+   - May indicate immediate routing issues
+   - Check error logs during connection setup
+
+3. **Event Listener Accumulation**
+   - High listener counts (>20) on sockets
+   - Indicates cleanup not removing all listeners
+   - Can cause memory leaks
+
+4. **Keep-Alive Zombies**
+   - Keep-alive connections not timing out
+   - Check keepAlive timeout settings
+   - May need more aggressive cleanup
+
+### Next Steps
+
+1. **Run the monitor in production** during accumulation
+2. **Share the diagnostic files** from `.nogit/connection-diagnostics/`
+3. **Look for patterns** in the captured snapshots
+4. **Check specific connection IDs** that accumulate
+
+The diagnostic files will show exactly what state connections are in when accumulation occurs, allowing targeted fixes for the specific issue.
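+
+To get a quick overview before opening individual snapshots, a small script can tally the patterns across all captured files. The following is only a sketch: it assumes each snapshot JSON contains a `connections` array with the per-connection fields listed above (`incomingState`, `outgoingState`, `bytesReceived`, `bytesSent`); adjust the field names to match the actual files.
+
+```typescript
+import * as fs from 'fs';
+import * as path from 'path';
+
+const dir = '.nogit/connection-diagnostics';
+let zombies = 0;
+let halfZombies = 0;
+let stuck = 0;
+
+for (const file of fs.readdirSync(dir).filter((f) => f.endsWith('.json'))) {
+  // Assumed snapshot shape: { connections: [{ incomingState, outgoingState, bytesReceived, bytesSent, ... }] }
+  const snapshot = JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8'));
+  for (const conn of snapshot.connections ?? []) {
+    const incomingDead = conn.incomingState?.destroyed === true;
+    const outgoingDead = conn.outgoingState?.destroyed === true;
+    if (incomingDead && outgoingDead) zombies++;            // full zombie: both sockets destroyed
+    else if (incomingDead || outgoingDead) halfZombies++;   // half-zombie: one socket destroyed
+    else if (conn.bytesReceived > 0 && conn.bytesSent === 0) stuck++; // stuck: data received, nothing sent back
+  }
+}
+
+console.log({ zombies, halfZombies, stuck });
+```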
+
+## ✅ FIXED: Stuck Connection Detection (January 2025)
+
+### Additional Root Cause Found
+Connections to hanging backends (that accept but never respond) were not being cleaned up because:
+- Both sockets remain alive (not destroyed)
+- Keep-alive prevents normal timeout
+- No data is sent back to the client despite receiving data
+- These don't qualify as "zombies" since sockets aren't destroyed
+
+### Fix Implemented
+Added stuck connection detection to the periodic inactivity check:
+
+```typescript
+// Check for stuck connections: no data sent back to client
+if (!record.connectionClosed && record.outgoing && record.bytesReceived > 0 && record.bytesSent === 0) {
+  const age = now - record.incomingStartTime;
+  // If connection is older than 60 seconds and no data sent back, likely stuck
+  if (age > 60000) {
+    logger.log('warn', `Stuck connection detected: ${connectionId} - received ${record.bytesReceived} bytes but sent 0 bytes`, {
+      connectionId,
+      remoteIP: record.remoteIP,
+      age: plugins.prettyMs(age),
+      bytesReceived: record.bytesReceived,
+      targetHost: record.targetHost,
+      targetPort: record.targetPort,
+      component: 'connection-manager'
+    });
+
+    // Clean up
+    this.cleanupConnection(record, 'stuck_no_response');
+  }
+}
+```
+
+### What This Fixes
+- Connections to backends that accept but never respond
+- Proxy chains where inner proxy connects to unresponsive services
+- Scenarios where keep-alive prevents normal timeout mechanisms
+- Connections that receive client data but never send anything back
+
+### Detection Criteria
+- Connection has received bytes from client (`bytesReceived > 0`)
+- No bytes sent back to client (`bytesSent === 0`)
+- Connection is older than 60 seconds
+- Both sockets are still alive (not destroyed)
+
+This complements the zombie detection by handling cases where sockets remain technically alive but the connection is effectively dead.
\ No newline at end of file
diff --git a/readme.monitoring.md b/readme.monitoring.md
new file mode 100644
index 0000000..cc397f0
--- /dev/null
+++ b/readme.monitoring.md
@@ -0,0 +1,202 @@
+# Production Connection Monitoring
+
+This document explains how to use the ProductionConnectionMonitor to diagnose connection accumulation issues in real time.
+
+## Quick Start
+
+```typescript
+import ProductionConnectionMonitor from './.nogit/debug/production-connection-monitor.js';
+
+// After starting your proxy
+const monitor = new ProductionConnectionMonitor(proxy);
+monitor.start(5000); // Check every 5 seconds
+
+// The monitor will automatically capture diagnostics when:
+// - Connections exceed 50 (default threshold)
+// - Sudden spike of 20+ connections occurs
+// - You manually call monitor.forceCaptureNow()
+```
+
+## What Gets Captured
+
+When accumulation is detected, the monitor saves a JSON file with:
+
+### Connection Details
+- Socket states (destroyed, readable, writable, readyState)
+- Connection age and activity timestamps
+- Data transfer statistics (bytes sent/received)
+- Target host and port information
+- Keep-alive status
+- Event listener counts
+
+### System State
+- Memory usage
+- Event loop lag (see the sampling sketch below)
+- Connection count trends
+- Termination statistics
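+
+Event loop lag measures how late scheduled work runs when the loop is blocked. The exact sampling method used by ProductionConnectionMonitor isn't shown in this diff; the sketch below is a generic, commonly used approach (an assumption, not taken from the monitor's code):
+
+```typescript
+// Generic event loop lag sampler: a repeating timer that fires late indicates a blocked loop.
+const intervalMs = 500;
+let last = process.hrtime.bigint();
+
+setInterval(() => {
+  const nowNs = process.hrtime.bigint();
+  const elapsedMs = Number(nowNs - last) / 1e6;
+  last = nowNs;
+  const lagMs = Math.max(0, elapsedMs - intervalMs);
+  if (lagMs > 50) {
+    console.warn(`Event loop lag: ${lagMs.toFixed(1)} ms`);
+  }
+}, intervalMs);
+```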
+
+## Reading Diagnostic Files
+
+Files are saved to `.nogit/connection-diagnostics/` with names like:
+```
+accumulation_2025-06-07T20-20-43-733Z_force_capture.json
+```
+
+### Key Fields to Check
+
+1. **Socket States**
+   ```json
+   "incomingState": {
+     "destroyed": false,
+     "readable": true,
+     "writable": true,
+     "readyState": "open"
+   }
+   ```
+   - Both destroyed = zombie connection
+   - One destroyed = half-zombie
+   - Both alive but old = potential stuck connection
+
+2. **Data Transfer**
+   ```json
+   "bytesReceived": 36,
+   "bytesSent": 0,
+   "timeSinceLastActivity": 60000
+   ```
+   - No bytes sent back = stuck connection
+   - High bytes but old = slow backend
+   - No activity = idle connection
+
+3. **Connection Flags**
+   ```json
+   "hasReceivedInitialData": false,
+   "hasKeepAlive": true,
+   "connectionClosed": false
+   ```
+   - hasReceivedInitialData=false on non-TLS = immediate routing
+   - hasKeepAlive=true = extended timeout applies
+   - connectionClosed=false = still tracked
+
+## Common Patterns
+
+### 1. Hanging Backend Pattern
+```json
+{
+  "bytesReceived": 36,
+  "bytesSent": 0,
+  "age": 120000,
+  "targetHost": "backend.example.com",
+  "incomingState": { "destroyed": false },
+  "outgoingState": { "destroyed": false }
+}
+```
+**Fix**: The stuck connection detection (60s timeout) should clean these up.
+
+### 2. Zombie Connection Pattern
+```json
+{
+  "incomingState": { "destroyed": true },
+  "outgoingState": { "destroyed": true },
+  "connectionClosed": false
+}
+```
+**Fix**: The zombie detection should clean these up within 30s.
+
+### 3. Event Listener Leak Pattern
+```json
+{
+  "incomingListeners": {
+    "data": 15,
+    "error": 20,
+    "close": 18
+  }
+}
+```
+**Issue**: Event listeners accumulating, potential memory leak.
+
+### 4. No Outgoing Socket Pattern
+```json
+{
+  "outgoingState": { "exists": false },
+  "connectionClosed": false,
+  "age": 5000
+}
+```
+**Issue**: Connection setup failed but cleanup didn't trigger.
+
+## Forcing Diagnostic Capture
+
+To capture current state immediately:
+```typescript
+monitor.forceCaptureNow();
+```
+
+This is useful when you notice accumulation starting.
+
+## Automated Analysis
+
+The monitor automatically analyzes patterns and logs:
+- Zombie/half-zombie counts
+- Stuck connection counts
+- Old connection counts
+- Memory usage
+- Recommendations
+
+## Integration Example
+
+```typescript
+// In your proxy startup script
+import { SmartProxy } from '@push.rocks/smartproxy';
+import ProductionConnectionMonitor from './production-connection-monitor.js';
+
+async function startProxyWithMonitoring() {
+  const proxy = new SmartProxy({
+    // your config
+  });
+
+  await proxy.start();
+
+  // Start monitoring
+  const monitor = new ProductionConnectionMonitor(proxy);
+  monitor.start(5000);
+
+  // Optional: Capture on specific events
+  process.on('SIGUSR1', () => {
+    console.log('Manual diagnostic capture triggered');
+    monitor.forceCaptureNow();
+  });
+
+  // Graceful shutdown
+  process.on('SIGTERM', async () => {
+    monitor.stop();
+    await proxy.stop();
+    process.exit(0);
+  });
+}
+```
+
+## Troubleshooting
+
+### Monitor Not Detecting Accumulation
+- Check threshold settings (default: 50 connections)
+- Reduce check interval for faster detection
+- Use forceCaptureNow() to capture current state
+
+### Too Many False Positives
+- Increase accumulation threshold
+- Increase spike threshold
+- Adjust check interval
+
+### Missing Diagnostic Data
+- Ensure output directory exists and is writable
+- Check disk space
+- Verify process has write permissions
+
+## Next Steps
+
+1. Deploy the monitor to production
+2. Wait for accumulation to occur
+3. Share diagnostic files for analysis
+4. Apply targeted fixes based on patterns found
+
+The diagnostic data will reveal the exact state of connections when accumulation occurs, enabling precise fixes for your specific scenario.
\ No newline at end of file
diff --git a/test/test.stuck-connection-cleanup.node.ts b/test/test.stuck-connection-cleanup.node.ts
new file mode 100644
index 0000000..1a6356c
--- /dev/null
+++ b/test/test.stuck-connection-cleanup.node.ts
@@ -0,0 +1,144 @@
+import { expect, tap } from '@git.zone/tstest/tapbundle';
+import * as net from 'net';
+import { SmartProxy } from '../ts/index.js';
+import * as plugins from '../ts/plugins.js';
+
+tap.test('stuck connection cleanup - verify connections to hanging backends are cleaned up', async (tools) => {
+  console.log('\n=== Stuck Connection Cleanup Test ===');
+  console.log('Purpose: Verify that connections to backends that accept but never respond are cleaned up');
+
+  // Create a hanging backend that accepts connections but never responds
+  let backendConnections = 0;
+  const hangingBackend = net.createServer((socket) => {
+    backendConnections++;
+    console.log(`Hanging backend: Connection ${backendConnections} received`);
+    // Accept the connection but never send any data back
+    // This simulates a hung backend service
+  });
+
+  await new Promise<void>((resolve) => {
+    hangingBackend.listen(9997, () => {
+      console.log('✓ Hanging backend started on port 9997');
+      resolve();
+    });
+  });
+
+  // Create proxy that forwards to hanging backend
+  const proxy = new SmartProxy({
+    routes: [{
+      name: 'to-hanging-backend',
+      match: { ports: 8589 },
+      action: {
+        type: 'forward',
+        target: { host: 'localhost', port: 9997 }
+      }
+    }],
+    keepAlive: true,
+    enableDetailedLogging: false,
+    inactivityTimeout: 5000, // 5 second inactivity check interval for faster testing
+  });
+
+  await proxy.start();
+  console.log('✓ Proxy started on port 8589');
+
+  // Create connections that will get stuck
+  console.log('\n--- Creating connections to hanging backend ---');
+  const clients: net.Socket[] = [];
+
+  for (let i = 0; i < 5; i++) {
+    const client = net.connect(8589, 'localhost');
+    clients.push(client);
+
+    await new Promise<void>((resolve) => {
+      client.on('connect', () => {
+        console.log(`Client ${i} connected`);
+        // Send data that will never get a response
+        client.write(`GET / HTTP/1.1\r\nHost: localhost\r\n\r\n`);
+        resolve();
+      });
+
+      client.on('error', (err) => {
+        console.log(`Client ${i} error: ${err.message}`);
+        resolve();
+      });
+    });
+  }
+
+  // Wait a moment for connections to establish
+  await plugins.smartdelay.delayFor(1000);
+
+  // Check initial connection count
+  const initialCount = (proxy as any).connectionManager.getConnectionCount();
+  console.log(`\nInitial connection count: ${initialCount}`);
+  expect(initialCount).toEqual(5);
+
+  // Get connection details
+  const connections = (proxy as any).connectionManager.getConnections();
+  let stuckCount = 0;
+
+  for (const [id, record] of connections) {
+    if (record.bytesReceived > 0 && record.bytesSent === 0) {
+      stuckCount++;
+      console.log(`Stuck connection ${id}: received=${record.bytesReceived}, sent=${record.bytesSent}`);
+    }
+  }
+
+  console.log(`Stuck connections found: ${stuckCount}`);
+  expect(stuckCount).toEqual(5);
+
+  // Wait for inactivity check to run (it checks every 30s by default, but we set it to 5s)
+  console.log('\n--- Waiting for stuck connection detection (65 seconds) ---');
+  console.log('Note: Stuck connections are cleaned up after 60 seconds with no response');
+
+  // Speed up time by manually triggering inactivity check after simulating time passage
+  // First, age the connections by updating their timestamps
+  const now = Date.now();
+  for (const [id, record] of connections) {
+    // Simulate that these connections are 61 seconds old
+    record.incomingStartTime = now - 61000;
+    record.lastActivity = now - 61000;
+  }
+
+  // Manually trigger inactivity check
+  console.log('Manually triggering inactivity check...');
+  (proxy as any).connectionManager.performOptimizedInactivityCheck();
+
+  // Wait for cleanup to complete
+  await plugins.smartdelay.delayFor(1000);
+
+  // Check connection count after cleanup
+  const afterCleanupCount = (proxy as any).connectionManager.getConnectionCount();
+  console.log(`\nConnection count after cleanup: ${afterCleanupCount}`);
+
+  // Verify termination stats
+  const stats = (proxy as any).connectionManager.getTerminationStats();
+  console.log('\nTermination stats:', stats);
+
+  // All connections should be cleaned up as "stuck_no_response"
+  expect(afterCleanupCount).toEqual(0);
+
+  // The termination reason might be under incoming or general stats
+  const stuckCleanups = (stats.incoming.stuck_no_response || 0) +
+                        (stats.outgoing?.stuck_no_response || 0);
+  console.log(`Stuck cleanups detected: ${stuckCleanups}`);
+  expect(stuckCleanups).toBeGreaterThan(0);
+
+  // Verify clients were disconnected
+  let closedClients = 0;
+  for (const client of clients) {
+    if (client.destroyed) {
+      closedClients++;
+    }
+  }
+  console.log(`Closed clients: ${closedClients}/5`);
+  expect(closedClients).toEqual(5);
+
+  // Cleanup
+  console.log('\n--- Cleanup ---');
+  await proxy.stop();
+  hangingBackend.close();
+
+  console.log('✓ Test complete: Stuck connections are properly detected and cleaned up');
+});
+
+tap.start();
\ No newline at end of file
diff --git a/ts/proxies/smart-proxy/connection-manager.ts b/ts/proxies/smart-proxy/connection-manager.ts
index b88f1af..a72d719 100644
--- a/ts/proxies/smart-proxy/connection-manager.ts
+++ b/ts/proxies/smart-proxy/connection-manager.ts
@@ -495,6 +495,32 @@ export class ConnectionManager extends LifecycleComponent {
           this.cleanupConnection(record, 'half_zombie_cleanup');
         }
       }
+
+      // Check for stuck connections: no data sent back to client
+      if (!record.connectionClosed && record.outgoing && record.bytesReceived > 0 && record.bytesSent === 0) {
+        const age = now - record.incomingStartTime;
+        // If connection is older than 60 seconds and no data sent back, likely stuck
+        if (age > 60000) {
+          logger.log('warn', `Stuck connection detected: ${connectionId} - received ${record.bytesReceived} bytes but sent 0 bytes`, {
+            connectionId,
+            remoteIP: record.remoteIP,
+            age: plugins.prettyMs(age),
+            bytesReceived: record.bytesReceived,
+            targetHost: record.targetHost,
+            targetPort: record.targetPort,
+            component: 'connection-manager'
+          });
+
+          // Set termination reason and increment stats
+          if (record.incomingTerminationReason == null) {
+            record.incomingTerminationReason = 'stuck_no_response';
+            this.incrementTerminationStat('incoming', 'stuck_no_response');
+          }
+
+          // Clean up
+          this.cleanupConnection(record, 'stuck_no_response');
+        }
+      }
     }
   }