From 6565c44c29f162a8de7c664db382aa32fbcf4f3b Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Sun, 24 May 2026 12:47:15 +0000 Subject: [PATCH] feat(cloudly): accept Spark node heartbeats --- changelog.md | 10 +++ package.json | 6 +- pnpm-lock.yaml | 38 +++++--- ts/classes.server.ts | 5 ++ ts/manager.jump/classes.jumpmanager.ts | 8 +- ts/manager.node/classes.clusternode.ts | 19 ++++ ts/manager.node/classes.nodemanager.ts | 120 +++++++++++++++++++++++++ 7 files changed, 189 insertions(+), 17 deletions(-) diff --git a/changelog.md b/changelog.md index f55be8c..542e95f 100644 --- a/changelog.md +++ b/changelog.md @@ -2,6 +2,16 @@ ## Pending +### Features + +- accept Spark node heartbeats + - Adds a Spark heartbeat HTTP endpoint for cluster nodes + - Stores Spark metrics and runtime info on cluster node records + - Extends jump onboarding with per-node Spark telemetry credentials + +### Maintenance + +- refresh release tooling dependencies ## 2026-05-24 - 5.8.2 diff --git a/package.json b/package.json index 525e116..628cdd2 100644 --- a/package.json +++ b/package.json @@ -23,15 +23,15 @@ "docs": "tsdoc aidoc" }, "devDependencies": { - "@git.zone/tsbuild": "^4.4.0", + "@git.zone/tsbuild": "^4.4.1", "@git.zone/tsbundle": "^2.10.4", "@git.zone/tsdoc": "^2.0.5", - "@git.zone/tsdocker": "^2.2.6", + "@git.zone/tsdocker": "^2.3.0", "@git.zone/tspublish": "^1.11.7", "@git.zone/tstest": "^3.6.6", "@git.zone/tswatch": "^3.3.5", "@push.rocks/smartnetwork": "^4.7.1", - "@types/node": "^25.6.2" + "@types/node": "^25.8.0" }, "dependencies": { "@api.global/typedrequest": "3.3.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 826fd34..fdededb 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -151,8 +151,8 @@ importers: version: 9.5.1 devDependencies: '@git.zone/tsbuild': - specifier: ^4.4.0 - version: 4.4.0 + specifier: ^4.4.1 + version: 4.4.1 '@git.zone/tsbundle': specifier: ^2.10.4 version: 2.10.4 @@ -160,8 +160,8 @@ importers: specifier: ^2.0.5 version: 2.0.5(ws@8.20.0)(zod@4.4.3) '@git.zone/tsdocker': - specifier: ^2.2.6 - version: 2.2.6 + specifier: ^2.3.0 + version: 2.3.0 '@git.zone/tspublish': specifier: ^1.11.7 version: 1.11.7 @@ -175,8 +175,8 @@ importers: specifier: ^4.7.1 version: 4.7.1 '@types/node': - specifier: ^25.6.2 - version: 25.6.2 + specifier: ^25.8.0 + version: 25.8.0 packages: @@ -831,8 +831,8 @@ packages: '@gerrit0/mini-shiki@3.23.0': resolution: {integrity: sha512-bEMORlG0cqdjVyCEuU0cDQbORWX+kYCeo0kV1lbxF5bt4r7SID2l9bqsxJEM0zndaxpOUT7riCyIVEuqq/Ynxg==} - '@git.zone/tsbuild@4.4.0': - resolution: {integrity: sha512-98igHfppi6blFYDyzNukNkj4FUO5ZlyXEaSyJh8vCkkZM8SyAgfZj+NUWA1D1iaPXE58UvK1Pt/o8p8iI9UHHw==} + '@git.zone/tsbuild@4.4.1': + resolution: {integrity: sha512-usxx8BBQsAypxjFOfd1GEV9pL9EUshRKktXtRWHMDByb6ps83+PdUIb3D7O+nkkBp4C9PXo3cfbsR4Asvo33CA==} hasBin: true '@git.zone/tsbundle@2.10.1': @@ -847,8 +847,8 @@ packages: resolution: {integrity: sha512-s0Jbq9q1lvPppaIsLRr0VJR5lJn9bBzSr4POssXHKFJlVXRU5UeefR7sRERXNYz45FUCXLn+PLAB786PKEAKXg==} hasBin: true - '@git.zone/tsdocker@2.2.6': - resolution: {integrity: sha512-vF0QT5od+t7UyWT8dQt6grybAiVx8EhpH6OZoMsleOrAyLMHEcfAKaPfTELXFnF5A+GPhAree+3KpRGyp5cGCg==} + '@git.zone/tsdocker@2.3.0': + resolution: {integrity: sha512-im2hD3Fu7vSb6qM+WMg2tbvLbFfEpX8qVmjy491R5iELky4Pw9cqRMkwzmxW92etn8v+f53ODUQDOoc9DufX2A==} hasBin: true '@git.zone/tspublish@1.11.6': @@ -2509,6 +2509,9 @@ packages: '@types/node@25.6.2': resolution: {integrity: sha512-sokuT28dxf9JT5Kady1fsXOvI4HVpjZa95NKT5y9PNTIrs2AsobR4GFAA90ZG8M+nxVRLysCXsVj6eGC7Vbrlw==} + '@types/node@25.8.0': + resolution: {integrity: sha512-TCFSk8IZh+iLX1xtksoBVtdmgL+1IX0fC9BeU4QqFSuNdN/K+HUlhqOzEmSYYpZUVsLYcPqc9KX+60iDuninSQ==} + '@types/randomatic@3.1.5': resolution: {integrity: sha512-VCwCTw6qh1pRRw+5rNTAwqPmf6A+hdrkdM7dBpZVmhl7g+em3ONXlYK/bWPVKqVGMWgP0d1bog8Vc/X6zRwRRQ==} @@ -4589,6 +4592,9 @@ packages: undici-types@7.19.2: resolution: {integrity: sha512-qYVnV5OEm2AW8cJMCpdV20CDyaN3g0AjDlOGf1OW4iaDEx8MwdtChUp4zu4H0VP3nDRF/8RKWH+IPp9uW0YGZg==} + undici-types@7.24.6: + resolution: {integrity: sha512-WRNW+sJgj5OBN4/0JpHFqtqzhpbnV0GuB+OozA9gCL7a993SmU+1JBZCzLNxYsbMfIeDL+lTsphD5jN5N+n0zg==} + unified@11.0.5: resolution: {integrity: sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==} @@ -5729,9 +5735,9 @@ snapshots: '@shikijs/types': 3.23.0 '@shikijs/vscode-textmate': 10.0.2 - '@git.zone/tsbuild@4.4.0': + '@git.zone/tsbuild@4.4.1': dependencies: - '@git.zone/tspublish': 1.11.6 + '@git.zone/tspublish': 1.11.7 '@push.rocks/early': 4.0.4 '@push.rocks/smartcli': 4.0.21 '@push.rocks/smartdelay': 3.1.0 @@ -5841,7 +5847,7 @@ snapshots: - ws - zod - '@git.zone/tsdocker@2.2.6': + '@git.zone/tsdocker@2.3.0': dependencies: '@push.rocks/lik': 6.4.1 '@push.rocks/projectinfo': 5.1.0 @@ -8398,6 +8404,10 @@ snapshots: dependencies: undici-types: 7.19.2 + '@types/node@25.8.0': + dependencies: + undici-types: 7.24.6 + '@types/randomatic@3.1.5': {} '@types/relateurl@0.2.33': {} @@ -10883,6 +10893,8 @@ snapshots: undici-types@7.19.2: {} + undici-types@7.24.6: {} + unified@11.0.5: dependencies: '@types/unist': 3.0.3 diff --git a/ts/classes.server.ts b/ts/classes.server.ts index e658422..83a4e0b 100644 --- a/ts/classes.server.ts +++ b/ts/classes.server.ts @@ -120,6 +120,11 @@ export class CloudlyServer { 'POST', async (ctx) => this.cloudlyRef.baseOsManager.handleHeartbeatHttpRequest(ctx), ); + this.typedServer.addRoute( + '/spark/v1/nodes/heartbeat', + 'POST', + async (ctx) => this.cloudlyRef.nodeManager.handleSparkHeartbeatHttpRequest(ctx), + ); this.typedServer.addRoute( '/baseos/v1/images/:buildId/download', 'GET', diff --git a/ts/manager.jump/classes.jumpmanager.ts b/ts/manager.jump/classes.jumpmanager.ts index 1da98f6..7696ea1 100644 --- a/ts/manager.jump/classes.jumpmanager.ts +++ b/ts/manager.jump/classes.jumpmanager.ts @@ -15,6 +15,7 @@ interface IClaimJumpCodeResponse { accepted: boolean; message?: string; nodeId?: string; + sparkNodeToken?: string; cloudlyUrl?: string; coreflowJumpCode?: string; } @@ -148,8 +149,10 @@ export class CloudlyJumpManager { const nodeId = plugins.smartunique.shortId(8); const now = Date.now(); + const sparkNodeToken = await this.cloudlyRef.authManager.createNewSecureToken(); const node = new this.cloudlyRef.nodeManager.CClusterNode(); node.id = nodeId; + node.sparkNodeTokenHash = this.hashSecret(sparkNodeToken); node.data = { clusterId: cluster.id, nodeType: jumpCodeDoc.data.nodeType, @@ -178,6 +181,7 @@ export class CloudlyJumpManager { return { accepted: true, nodeId: node.id, + sparkNodeToken, cloudlyUrl: cluster.data.cloudlyUrl || `${this.getPublicCloudlyUrl()}/`, coreflowJumpCode, }; @@ -292,8 +296,10 @@ CLAIM_RESPONSE="$(curl -fsSL -X POST "\${CLAIM_URL}" -H 'content-type: applicati export CLAIM_RESPONSE CLOUDLY_URL="$(node -e 'const data = JSON.parse(process.env.CLAIM_RESPONSE); if (!data.accepted) { throw new Error(data.message || "Cloudly rejected jump code"); } process.stdout.write(data.cloudlyUrl);')" COREFLOW_JUMPCODE="$(node -e 'const data = JSON.parse(process.env.CLAIM_RESPONSE); if (!data.coreflowJumpCode) { throw new Error("Cloudly did not return a Coreflow jump code"); } process.stdout.write(data.coreflowJumpCode);')" +SPARK_NODE_ID="$(node -e 'const data = JSON.parse(process.env.CLAIM_RESPONSE); if (!data.nodeId) { throw new Error("Cloudly did not return a Spark node id"); } process.stdout.write(data.nodeId);')" +SPARK_NODE_TOKEN="$(node -e 'const data = JSON.parse(process.env.CLAIM_RESPONSE); if (!data.sparkNodeToken) { throw new Error("Cloudly did not return a Spark node token"); } process.stdout.write(data.sparkNodeToken);')" -spark installdaemon --mode=coreflow-node --cloudlyUrl="\${CLOUDLY_URL}" --jumpcode="\${COREFLOW_JUMPCODE}" +spark installdaemon --mode=coreflow-node --cloudlyUrl="\${CLOUDLY_URL}" --jumpcode="\${COREFLOW_JUMPCODE}" --nodeId="\${SPARK_NODE_ID}" --nodeToken="\${SPARK_NODE_TOKEN}" echo "Cloudly jump completed. This system is now connected." `; diff --git a/ts/manager.node/classes.clusternode.ts b/ts/manager.node/classes.clusternode.ts index 21b8371..76c6c21 100644 --- a/ts/manager.node/classes.clusternode.ts +++ b/ts/manager.node/classes.clusternode.ts @@ -39,6 +39,9 @@ export class ClusterNode extends plugins.smartdata.SmartDataDbDoc< @plugins.smartdata.svDb() public data!: plugins.servezoneInterfaces.data.IClusterNode['data']; + @plugins.smartdata.svDb() + public sparkNodeTokenHash?: string; + constructor() { super(); } @@ -54,6 +57,22 @@ export class ClusterNode extends plugins.smartdata.SmartDataDbDoc< await this.save(); } + public async updateSparkHeartbeat( + metricsArg: plugins.servezoneInterfaces.data.IClusterNodeMetrics, + runtimeInfoArg: Record, + ) { + this.data.metrics = metricsArg; + (this.data as plugins.servezoneInterfaces.data.IClusterNode['data'] & { + sparkRuntimeInfo?: Record; + }).sparkRuntimeInfo = runtimeInfoArg; + this.data.status = 'online'; + this.data.lastHealthCheck = Date.now(); + if (typeof runtimeInfoArg.swarmNodeId === 'string' && runtimeInfoArg.swarmNodeId) { + this.data.swarmNodeId = runtimeInfoArg.swarmNodeId; + } + await this.save(); + } + public async updateStatus(status: plugins.servezoneInterfaces.data.IClusterNode['data']['status']) { this.data.status = status; await this.save(); diff --git a/ts/manager.node/classes.nodemanager.ts b/ts/manager.node/classes.nodemanager.ts index a7c41d4..16d20d7 100644 --- a/ts/manager.node/classes.nodemanager.ts +++ b/ts/manager.node/classes.nodemanager.ts @@ -4,6 +4,18 @@ import { Cluster } from '../manager.cluster/classes.cluster.js'; import { ClusterNode } from './classes.clusternode.js'; import { CurlFresh } from './classes.curlfresh.js'; +interface ISparkHeartbeatRequest { + nodeId?: string; + nodeToken?: string; + metrics?: plugins.servezoneInterfaces.data.IClusterNodeMetrics; + runtimeInfo?: Record; +} + +interface ISparkHeartbeatResponse { + accepted: boolean; + message?: string; +} + export class CloudlyNodeManager { public cloudlyRef: Cloudly; public typedRouter = new plugins.typedrequest.TypedRouter(); @@ -51,6 +63,69 @@ export class CloudlyNodeManager { public async stop() {} + public async handleSparkHeartbeatHttpRequest( + ctxArg: plugins.typedserver.IRequestContext, + ): Promise { + try { + const requestData = await this.readJsonBody(ctxArg); + const response = await this.acceptSparkHeartbeat(requestData); + return this.createJsonResponse(200, response); + } catch (error) { + return this.createJsonResponse(400, { + accepted: false, + message: `Spark heartbeat failed: ${(error as Error).message}`, + } satisfies ISparkHeartbeatResponse); + } + } + + public async acceptSparkHeartbeat( + requestDataArg: ISparkHeartbeatRequest, + ): Promise { + if (!requestDataArg.nodeId) { + return { + accepted: false, + message: 'Spark node id is missing', + }; + } + if (!requestDataArg.nodeToken) { + return { + accepted: false, + message: 'Spark node token is missing', + }; + } + if (!this.isSparkMetrics(requestDataArg.metrics)) { + return { + accepted: false, + message: 'Spark metrics are missing or invalid', + }; + } + if (!this.isSparkRuntimeInfo(requestDataArg.runtimeInfo, requestDataArg.nodeId)) { + return { + accepted: false, + message: 'Spark runtime info is missing or invalid', + }; + } + + const node = await this.CClusterNode.getInstance({ id: requestDataArg.nodeId }); + if (!node) { + return { + accepted: false, + message: 'Spark node is unknown', + }; + } + if (node.sparkNodeTokenHash !== this.hashSecret(requestDataArg.nodeToken)) { + return { + accepted: false, + message: 'Spark node token is invalid', + }; + } + + await node.updateSparkHeartbeat(requestDataArg.metrics, requestDataArg.runtimeInfo); + return { + accepted: true, + }; + } + /** * creates the node infrastructure on hetzner * ensures that there are exactly the resources that are needed @@ -133,4 +208,49 @@ export class CloudlyNodeManager { }); return results; } + + private isSparkMetrics(valueArg: unknown): valueArg is plugins.servezoneInterfaces.data.IClusterNodeMetrics { + if (!valueArg || typeof valueArg !== 'object') { + return false; + } + const metrics = valueArg as Partial; + return typeof metrics.cpuUsagePercent === 'number' + && typeof metrics.memoryUsedMB === 'number' + && typeof metrics.memoryAvailableMB === 'number' + && typeof metrics.diskUsedGB === 'number' + && typeof metrics.diskAvailableGB === 'number' + && typeof metrics.containerCount === 'number' + && typeof metrics.timestamp === 'number'; + } + + private isSparkRuntimeInfo(valueArg: unknown, nodeIdArg: string): valueArg is Record { + if (!valueArg || typeof valueArg !== 'object') { + return false; + } + const runtimeInfo = valueArg as Record; + return runtimeInfo.runtime === 'spark' + && runtimeInfo.nodeId === nodeIdArg + && typeof runtimeInfo.checkedAt === 'number'; + } + + private hashSecret(secretArg: string) { + return plugins.crypto.createHash('sha256').update(secretArg).digest('hex'); + } + + private async readJsonBody(ctxArg: plugins.typedserver.IRequestContext): Promise { + const bodyString = (await ctxArg.text()).trim(); + return bodyString ? JSON.parse(bodyString) as T : {} as T; + } + + private createJsonResponse( + statusCodeArg: number, + bodyArg: object, + ): Response { + return new Response(JSON.stringify(bodyArg), { + status: statusCodeArg, + headers: { + 'Content-Type': 'application/json', + }, + }); + } }