Compare commits

...

3 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
b66140bfd2 refactor: tighten ws batch typing and queue cleanup logging 2026-06-16 22:50:03 +00:00
copilot-swe-agent[bot]
45158ff45b feat: batch redis ws direct messages and dedupe rebuild queue jobs 2026-06-16 22:46:12 +00:00
Owen
06fb3685e4 Add queue 2026-06-11 12:25:57 -07:00
6 changed files with 458 additions and 37 deletions

View File

@@ -24,6 +24,7 @@ import license from "#dynamic/license/license";
import { initLogCleanupInterval } from "@server/lib/cleanupLogs";
import { initAcmeCertSync } from "#dynamic/lib/acmeCertSync";
import { fetchServerIp } from "@server/lib/serverIpService";
import { startRebuildQueueProcessor } from "@server/lib/rebuildClientAssociations";
async function startServers() {
await setHostMeta();
@@ -41,6 +42,7 @@ async function startServers() {
initLogCleanupInterval();
initAcmeCertSync();
startRebuildQueueProcessor();
// Start all servers
const apiServer = createApiServer();

View File

@@ -8,6 +8,7 @@ import {
exitNodes,
newts,
olms,
primaryDb,
roleSiteResources,
Site,
SiteResource,
@@ -40,6 +41,7 @@ import {
removeTargets as removeSubnetProxyTargets
} from "@server/routers/client/targets";
import { lockManager } from "#dynamic/lib/lock";
import { rebuildQueue } from "#dynamic/lib/rebuildQueue";
// TTL for rebuild-association locks. These functions can fan out into many
// peer/proxy updates, so give them a generous window.
@@ -167,11 +169,32 @@ export async function rebuildClientAssociationsFromSiteResource(
subnet: string | null;
}[];
}> {
return await lockManager.withLock(
`rebuild-client-associations:site-resource:${siteResource.siteResourceId}`,
() => rebuildClientAssociationsFromSiteResourceImpl(siteResource, trx),
REBUILD_ASSOCIATIONS_LOCK_TTL_MS
);
try {
return await lockManager.withLock(
`rebuild-client-associations:site-resource:${siteResource.siteResourceId}`,
() =>
rebuildClientAssociationsFromSiteResourceImpl(
siteResource,
trx
),
REBUILD_ASSOCIATIONS_LOCK_TTL_MS
);
} catch (err: any) {
if (
typeof err?.message === "string" &&
err.message.startsWith("Failed to acquire lock")
) {
logger.warn(
`rebuildClientAssociations: could not acquire lock for site resource ${siteResource.siteResourceId}, queuing for deferred processing`
);
await rebuildQueue.enqueue({
type: "site-resource",
id: siteResource.siteResourceId
});
return { mergedAllClients: [] };
}
throw err;
}
}
async function rebuildClientAssociationsFromSiteResourceImpl(
@@ -956,11 +979,28 @@ export async function rebuildClientAssociationsFromClient(
client: Client,
trx: Transaction | typeof db = db
): Promise<void> {
return await lockManager.withLock(
`rebuild-client-associations:client:${client.clientId}`,
() => rebuildClientAssociationsFromClientImpl(client, trx),
REBUILD_ASSOCIATIONS_LOCK_TTL_MS
);
try {
return await lockManager.withLock(
`rebuild-client-associations:client:${client.clientId}`,
() => rebuildClientAssociationsFromClientImpl(client, trx),
REBUILD_ASSOCIATIONS_LOCK_TTL_MS
);
} catch (err: any) {
if (
typeof err?.message === "string" &&
err.message.startsWith("Failed to acquire lock")
) {
logger.warn(
`rebuildClientAssociations: could not acquire lock for client ${client.clientId}, queuing for deferred processing`
);
await rebuildQueue.enqueue({
type: "client",
id: client.clientId
});
return;
}
throw err;
}
}
async function rebuildClientAssociationsFromClientImpl(
@@ -1906,3 +1946,47 @@ export async function cleanupSiteAssociations(
logger.debug(`cleanupSiteAssociations: DONE siteId=${siteId}`);
}
/**
* Start the background rebuild queue processor. This should be called once
* during server startup. Only one server instance at a time will actively
* consume the queue (enforced via a distributed Redis lock); all other
* instances will poll and wait until the lock becomes available.
*/
export function startRebuildQueueProcessor(): void {
rebuildQueue.startProcessing({
onSiteResource: async (siteResourceId: number) => {
const [siteResource] = await primaryDb
.select()
.from(siteResources)
.where(eq(siteResources.siteResourceId, siteResourceId));
if (!siteResource) {
logger.warn(
`Rebuild queue: site resource ${siteResourceId} not found, skipping`
);
return;
}
await rebuildClientAssociationsFromSiteResource(
siteResource,
primaryDb
);
},
onClient: async (clientId: number) => {
const [client] = await primaryDb
.select()
.from(clients)
.where(eq(clients.clientId, clientId));
if (!client) {
logger.warn(
`Rebuild queue: client ${clientId} not found, skipping`
);
return;
}
await rebuildClientAssociationsFromClient(client, primaryDb);
}
});
}

View File

@@ -0,0 +1,23 @@
export type RebuildJobType = "site-resource" | "client";
export interface RebuildJob {
type: RebuildJobType;
id: number;
}
export interface RebuildJobHandlers {
onSiteResource(siteResourceId: number): Promise<void>;
onClient(clientId: number): Promise<void>;
}
export interface RebuildQueueManager {
enqueue(job: RebuildJob): Promise<void>;
startProcessing(handlers: RebuildJobHandlers): void;
}
class NoopRebuildQueue implements RebuildQueueManager {
async enqueue(_job: RebuildJob): Promise<void> {}
startProcessing(_handlers: RebuildJobHandlers): void {}
}
export const rebuildQueue: RebuildQueueManager = new NoopRebuildQueue();

View File

@@ -0,0 +1,198 @@
/*
* This file is part of a proprietary work.
*
* Copyright (c) 2025-2026 Fossorial, Inc.
* All rights reserved.
*
* This file is licensed under the Fossorial Commercial License.
* You may not use this file except in compliance with the License.
* Unauthorized use, copying, modification, or distribution is strictly prohibited.
*
* This file is not licensed under the AGPLv3.
*/
import { redis } from "#private/lib/redis";
import { lockManager } from "#dynamic/lib/lock";
import logger from "@server/logger";
export type RebuildJobType = "site-resource" | "client";
export interface RebuildJob {
type: RebuildJobType;
id: number;
}
export interface RebuildJobHandlers {
onSiteResource(siteResourceId: number): Promise<void>;
onClient(clientId: number): Promise<void>;
}
// Redis list holding pending rebuild jobs (RPUSH to enqueue, LPOP to dequeue — FIFO order).
const QUEUE_KEY = "rebuild-client-associations:queue";
const QUEUED_SET_KEY = "rebuild-client-associations:queued";
// Distributed lock that serialises queue consumption to a single server instance
// at a time. TTL is generous enough to cover a full batch of expensive rebuilds.
const PROCESSOR_LOCK_KEY = "rebuild-client-associations:processor";
// Each rebuild can take up to REBUILD_ASSOCIATIONS_LOCK_TTL_MS (120 s) per
// resource. Allow BATCH_SIZE resources per processor-lock acquisition, plus a
// small buffer.
const BATCH_SIZE = 5;
const PROCESSOR_LOCK_TTL_MS = 120000 * BATCH_SIZE + 30000; // ~630 s
const POLL_INTERVAL_MS = 500;
class RedisRebuildQueue {
private processingStarted = false;
async enqueue(job: RebuildJob): Promise<void> {
if (!redis || redis.status !== "ready") {
logger.warn(
`Rebuild queue: Redis not available — rebuild for ${job.type}:${job.id} will not be retried`
);
return;
}
try {
const dedupeKey = `${job.type}:${job.id}`;
const added = await redis.sadd(QUEUED_SET_KEY, dedupeKey);
if (added === 0) {
logger.debug(
`Rebuild queue: skipped duplicate queued job ${job.type}:${job.id}`
);
return;
}
await redis.rpush(QUEUE_KEY, JSON.stringify(job));
logger.debug(
`Rebuild queue: enqueued ${job.type}:${job.id} (queue position: tail)`
);
} catch (err) {
await redis
.srem(QUEUED_SET_KEY, `${job.type}:${job.id}`)
.catch((cleanupErr) =>
logger.warn(
`Rebuild queue: failed to cleanup dedupe key for ${job.type}:${job.id} after enqueue failure:`,
cleanupErr
)
);
logger.error(
`Rebuild queue: failed to enqueue ${job.type}:${job.id}:`,
err
);
}
}
startProcessing(handlers: RebuildJobHandlers): void {
if (this.processingStarted) return;
this.processingStarted = true;
this.processLoop(handlers).catch((err) => {
logger.error("Rebuild queue processor loop crashed:", err);
});
logger.info("Rebuild queue processor started");
}
private async processLoop(handlers: RebuildJobHandlers): Promise<void> {
while (true) {
try {
await this.tryProcessBatch(handlers);
} catch (err) {
logger.error(
"Rebuild queue: unhandled error in process loop:",
err
);
}
await new Promise((resolve) =>
setTimeout(resolve, POLL_INTERVAL_MS)
);
}
}
private async tryProcessBatch(handlers: RebuildJobHandlers): Promise<void> {
if (!redis || redis.status !== "ready") return;
// Peek before acquiring the processor lock to avoid unnecessary Redis
// round-trips and lock contention when the queue is idle.
const queueLength = await redis.llen(QUEUE_KEY).catch(() => 0);
if (queueLength === 0) return;
try {
await lockManager.withLock(
PROCESSOR_LOCK_KEY,
async () => {
for (let i = 0; i < BATCH_SIZE; i++) {
if (!redis || redis.status !== "ready") break;
const payload = await redis.lpop(QUEUE_KEY);
if (payload === null) break; // queue drained
let job: RebuildJob;
try {
job = JSON.parse(payload) as RebuildJob;
} catch {
logger.error(
`Rebuild queue: could not parse job payload, discarding: ${payload}`
);
continue;
}
// Remove from dedupe set once dequeued so the same job
// can be re-queued while this one is in progress.
await redis
.srem(QUEUED_SET_KEY, `${job.type}:${job.id}`)
.catch((cleanupErr) =>
logger.warn(
`Rebuild queue: failed to remove dedupe key for ${job.type}:${job.id} on dequeue:`,
cleanupErr
)
);
logger.debug(
`Rebuild queue: processing ${job.type}:${job.id}`
);
try {
if (job.type === "site-resource") {
await handlers.onSiteResource(job.id);
} else if (job.type === "client") {
await handlers.onClient(job.id);
} else {
logger.warn(
`Rebuild queue: unknown job type "${(job as any).type}", discarding`
);
}
logger.debug(
`Rebuild queue: completed ${job.type}:${job.id}`
);
} catch (err) {
logger.error(
`Rebuild queue: job ${job.type}:${job.id} threw an error:`,
err
);
}
}
},
PROCESSOR_LOCK_TTL_MS
);
} catch (err: any) {
if (
typeof err?.message === "string" &&
err.message.startsWith("Failed to acquire lock")
) {
// Another server instance currently holds the processor lock and
// is consuming the queue — nothing to do this cycle.
logger.debug(
"Rebuild queue: processor lock held by another instance, skipping this cycle"
);
} else {
throw err;
}
}
}
}
export const rebuildQueue: RedisRebuildQueue = new RedisRebuildQueue();

View File

@@ -187,6 +187,8 @@ const wss: WebSocketServer = new WebSocketServer({ noServer: true });
// Generate unique node ID for this instance
const NODE_ID = uuidv4();
const REDIS_CHANNEL = "websocket_messages";
const REDIS_DIRECT_BATCH_SIZE = 250;
const REDIS_DIRECT_FLUSH_INTERVAL_MS = 10;
// Client tracking map (local to this node)
const connectedClients: Map<string, AuthenticatedWebSocket[]> = new Map();
@@ -197,6 +199,15 @@ const clientConfigVersions: Map<string, number> = new Map();
// Recovery tracking
let isRedisRecoveryInProgress = false;
interface RedisDirectBatchEntry {
targetClientId: string;
message: WSMessage;
resolve: () => void;
}
let pendingRedisDirectMessages: RedisDirectBatchEntry[] = [];
let redisDirectFlushTimer: NodeJS.Timeout | null = null;
// Helper to get map key
const getClientMapKey = (clientId: string) => clientId;
@@ -207,6 +218,78 @@ const getNodeConnectionsKey = (nodeId: string, clientId: string) =>
const getConfigVersionKey = (clientId: string) =>
`ws:configVersion:${clientId}`;
const clearRedisDirectFlushTimer = (): void => {
if (redisDirectFlushTimer) {
clearTimeout(redisDirectFlushTimer);
redisDirectFlushTimer = null;
}
};
const publishDirectBatch = async (
entries: RedisDirectBatchEntry[]
): Promise<void> => {
const redisMessage: RedisMessage = {
type: "direct-batch",
messages: entries.map((entry) => ({
targetClientId: entry.targetClientId,
message: entry.message
})),
fromNodeId: NODE_ID
};
await redisManager.publish(REDIS_CHANNEL, JSON.stringify(redisMessage));
};
const flushPendingRedisDirectMessages = async (): Promise<void> => {
clearRedisDirectFlushTimer();
if (pendingRedisDirectMessages.length === 0) {
return;
}
const entries = pendingRedisDirectMessages;
pendingRedisDirectMessages = [];
if (!redisManager.isRedisEnabled()) {
entries.forEach((entry) => entry.resolve());
return;
}
for (let i = 0; i < entries.length; i += REDIS_DIRECT_BATCH_SIZE) {
const batch = entries.slice(i, i + REDIS_DIRECT_BATCH_SIZE);
try {
await publishDirectBatch(batch);
} catch (error) {
logger.error(
"Failed to send batched direct messages via Redis, messages may be lost:",
error
);
} finally {
batch.forEach((entry) => entry.resolve());
}
}
};
const enqueueRedisDirectMessage = async (
targetClientId: string,
message: WSMessage
): Promise<void> => {
await new Promise<void>((resolve) => {
pendingRedisDirectMessages.push({ targetClientId, message, resolve });
if (pendingRedisDirectMessages.length >= REDIS_DIRECT_BATCH_SIZE) {
void flushPendingRedisDirectMessages();
return;
}
if (!redisDirectFlushTimer) {
redisDirectFlushTimer = setTimeout(() => {
void flushPendingRedisDirectMessages();
}, REDIS_DIRECT_FLUSH_INTERVAL_MS);
}
});
};
// Initialize Redis subscription for cross-node messaging
const initializeRedisSubscription = async (): Promise<void> => {
if (!redisManager.isRedisEnabled()) return;
@@ -227,7 +310,16 @@ const initializeRedisSubscription = async (): Promise<void> => {
// Send to specific client on this node
await sendToClientLocal(
redisMessage.targetClientId,
redisMessage.message
redisMessage.message,
{},
redisMessage.message.configVersion
);
} else if (
redisMessage.type === "direct-batch" &&
redisMessage.messages
) {
await sendRedisDirectBatchToLocalClients(
redisMessage.messages
);
} else if (redisMessage.type === "broadcast") {
// Broadcast to all clients on this node except excluded
@@ -503,7 +595,8 @@ const incrementClientConfigVersion = async (
const sendToClientLocal = async (
clientId: string,
message: WSMessage,
options: SendMessageOptions = {}
options: SendMessageOptions = {},
preResolvedConfigVersion?: number
): Promise<boolean> => {
const mapKey = getClientMapKey(clientId);
const clients = connectedClients.get(mapKey);
@@ -512,7 +605,8 @@ const sendToClientLocal = async (
}
// Handle config version
const configVersion = await getClientConfigVersion(clientId);
const configVersion =
preResolvedConfigVersion ?? (await getClientConfigVersion(clientId));
// Add config version to message
const messageWithVersion = {
@@ -545,6 +639,20 @@ const sendToClientLocal = async (
return true;
};
const sendRedisDirectBatchToLocalClients = async (
entries: { targetClientId: string; message: WSMessage }[]
): Promise<void> => {
const jobs = entries.map((entry) =>
sendToClientLocal(
entry.targetClientId,
entry.message,
{},
entry.message.configVersion
)
);
await Promise.all(jobs);
};
const broadcastToAllExceptLocal = async (
message: WSMessage,
excludeClientId?: string,
@@ -607,23 +715,13 @@ const sendToClient = async (
// Only send via Redis if the client is not connected locally and Redis is enabled
if (!localSent && redisManager.isRedisEnabled()) {
try {
const redisMessage: RedisMessage = {
type: "direct",
targetClientId: clientId,
message: {
...message,
configVersion
},
fromNodeId: NODE_ID
};
await redisManager.publish(
REDIS_CHANNEL,
JSON.stringify(redisMessage)
);
await enqueueRedisDirectMessage(clientId, {
...message,
configVersion
});
} catch (error) {
logger.error(
"Failed to send message via Redis, message may be lost:",
"Failed to queue batched direct message for Redis delivery, message may be lost:",
error
);
// Continue execution - local delivery already attempted
@@ -1109,6 +1207,8 @@ const disconnectClient = async (clientId: string): Promise<boolean> => {
// Cleanup function for graceful shutdown
const cleanup = async (): Promise<void> => {
try {
await flushPendingRedisDirectMessages();
// Close all WebSocket connections
connectedClients.forEach((clients) => {
clients.forEach((client) => {

View File

@@ -76,12 +76,26 @@ export interface SendMessageOptions {
compress?: boolean;
}
// Redis message type for cross-node communication
export interface RedisMessage {
type: "direct" | "broadcast";
targetClientId?: string;
excludeClientId?: string;
message: WSMessage;
fromNodeId: string;
options?: SendMessageOptions;
}
// Redis message types for cross-node communication
export type RedisMessage =
| {
type: "direct";
targetClientId: string;
message: WSMessage;
fromNodeId: string;
}
| {
type: "direct-batch";
messages: {
targetClientId: string;
message: WSMessage;
}[];
fromNodeId: string;
}
| {
type: "broadcast";
excludeClientId?: string;
message: WSMessage;
fromNodeId: string;
options?: SendMessageOptions;
};