Add locks to rebuilds

This commit is contained in:
Owen
2026-06-24 13:50:33 -04:00
parent 034bcbd271
commit 80b66cf9b9
8 changed files with 179 additions and 41 deletions

View File

@@ -29,8 +29,11 @@ import { updateResourcePolicies } from "./resourcePolicies";
import { BlueprintSource } from "@server/routers/blueprints/types";
import { stringify as stringifyYaml } from "yaml";
import { generateName } from "@server/db/names";
import { handleMessagingForUpdatedSiteResource } from "@server/routers/siteResource";
import { rebuildClientAssociationsFromSiteResource } from "../rebuildClientAssociations";
import {
handleMessagingForUpdatedSiteResource,
rebuildClientAssociationsFromSiteResource,
waitForSiteResourceRebuildIdle
} from "../rebuildClientAssociations";
type ApplyBlueprintArgs = {
orgId: string;
@@ -138,26 +141,25 @@ export async function applyBlueprint({
for (const result of privateResourcesResults) {
rebuildClientAssociationsFromSiteResource(
result.newSiteResource
).catch((e) => {
logger.error(
`Failed to rebuild client associations for site resource ${result.newSiteResource.siteResourceId}. Error: ${e}`
);
});
handleMessagingForUpdatedSiteResource(
result.oldSiteResource,
result.newSiteResource,
result.oldSites.map((site) => ({
// only need to run this on the old sites because the new sites are added above
siteId: site.siteId,
orgId: result.newSiteResource.orgId
}))
).catch((err) => {
logger.error(
`Error handling messaging for updated site resource ${result.newSiteResource.siteResourceId}:`,
err
);
});
)
.then(() =>
waitForSiteResourceRebuildIdle(
result.newSiteResource.siteResourceId
)
)
.then(() =>
handleMessagingForUpdatedSiteResource(
result.oldSiteResource,
result.newSiteResource,
result.oldSites.map((s) => s.siteId),
result.newSites.map((s) => s.siteId)
)
)
.catch((e) => {
logger.error(
`Failed to rebuild and handle messaging for site resource ${result.newSiteResource.siteResourceId}. Error: ${e}`
);
});
}
logger.debug(

View File

@@ -35,7 +35,7 @@ export class LockManager {
ttl: number;
owner?: string;
}> {
return { exists: true, ownedByMe: true, ttl: 0 };
return { exists: false, ownedByMe: false, ttl: 0 };
}
/**

View File

@@ -49,6 +49,112 @@ import { rebuildQueue } from "#dynamic/lib/rebuildQueue";
// peer/proxy updates, so give them a generous window.
const REBUILD_ASSOCIATIONS_LOCK_TTL_MS = 120000;
// Delay between successive idle checks in the waitFor*RebuildIdle pollers.
const REBUILD_IDLE_POLL_INTERVAL_MS = 300;
// Default timeout for waitForSiteResourceRebuildIdle.
const REBUILD_IDLE_DEFAULT_TIMEOUT_MS = 130_000; // slightly longer than lock TTL
// Default timeout for waitForSiteRebuildIdle and waitForClientRebuildIdle.
const REBUILD_IDLE_HANDLER_TIMEOUT_MS = 5_000;
/**
 * Reports whether a rebuild for one site resource is still in flight.
 *
 * A rebuild counts as in flight when the lock manager reports that the
 * per-site-resource rebuild lock key exists, or, failing that, when the
 * rebuild queue reports a "site-resource" job for the same id as queued.
 * The queue is only consulted when no lock is found.
 *
 * NOTE(review): the lock lookup and the queue lookup are two separate calls,
 * so a job that leaves the queue and takes the lock between them could be
 * missed — confirm against the queue worker's ordering.
 *
 * @param siteResourceId - id of the site resource to inspect
 * @returns true when a lock exists or a job is queued, otherwise false
 */
export async function hasActiveSiteResourceRebuild(
    siteResourceId: number
): Promise<boolean> {
    const { exists: lockHeld } = await lockManager.getLockInfo(
        `rebuild-client-associations:site-resource:${siteResourceId}`
    );

    if (!lockHeld) {
        // No lock: fall back to whatever the queue says about this job.
        const queued = await rebuildQueue.isQueued({
            type: "site-resource",
            id: siteResourceId
        });
        return queued;
    }

    return true;
}
/**
 * Polls until the given site resource has neither an active nor a queued
 * rebuild, sleeping REBUILD_IDLE_POLL_INTERVAL_MS between checks.
 *
 * The promise never rejects because of the timeout: once `timeoutMs` has
 * elapsed the loop stops, a warning is logged and the promise resolves anyway.
 *
 * @param siteResourceId - id of the site resource to wait on
 * @param timeoutMs - maximum time to keep polling, in milliseconds
 */
export async function waitForSiteResourceRebuildIdle(
    siteResourceId: number,
    timeoutMs = REBUILD_IDLE_DEFAULT_TIMEOUT_MS
): Promise<void> {
    const pause = (): Promise<void> =>
        new Promise<void>((resolve) =>
            setTimeout(resolve, REBUILD_IDLE_POLL_INTERVAL_MS)
        );
    const cutoff = Date.now() + timeoutMs;

    while (Date.now() < cutoff) {
        const busy = await hasActiveSiteResourceRebuild(siteResourceId);
        if (!busy) {
            return;
        }
        await pause();
    }

    // Ran out of time; callers proceed regardless, so only leave a trace.
    logger.warn(
        `waitForSiteResourceRebuildIdle: timed out after ${timeoutMs}ms waiting for siteResourceId=${siteResourceId}`
    );
}
/**
 * Polls until none of the site resources linked to a site has an active or
 * queued rebuild.
 *
 * On every poll the site resources are looked up again by joining
 * `siteResources` to `siteNetworks` on `networkId` and filtering on the
 * given `siteId`; each one is then checked in turn, stopping at the first
 * that is still busy. When `timeoutMs` elapses a warning is logged and the
 * promise resolves anyway.
 *
 * @param siteId - id of the site whose resources are inspected
 * @param timeoutMs - maximum time to keep polling, in milliseconds
 */
export async function waitForSiteRebuildIdle(
    siteId: number,
    timeoutMs = REBUILD_IDLE_HANDLER_TIMEOUT_MS
): Promise<void> {
    const pause = (): Promise<void> =>
        new Promise<void>((resolve) =>
            setTimeout(resolve, REBUILD_IDLE_POLL_INTERVAL_MS)
        );
    const cutoff = Date.now() + timeoutMs;

    while (Date.now() < cutoff) {
        const rows = await db
            .select({ siteResourceId: siteResources.siteResourceId })
            .from(siteResources)
            .innerJoin(
                siteNetworks,
                eq(siteNetworks.networkId, siteResources.networkId)
            )
            .where(eq(siteNetworks.siteId, siteId));

        // Sequential on purpose: stop at the first resource still rebuilding.
        let busy = false;
        for (const row of rows) {
            busy = await hasActiveSiteResourceRebuild(row.siteResourceId);
            if (busy) {
                break;
            }
        }
        if (!busy) {
            return;
        }

        await pause();
    }

    logger.warn(
        `waitForSiteRebuildIdle: timed out after ${timeoutMs}ms waiting for siteId=${siteId}`
    );
}
/**
 * Polls until none of the site resources recorded for a client has an active
 * or queued rebuild.
 *
 * On every poll the site resource ids are read again from
 * `clientSiteResourcesAssociationsCache` for the given `clientId`; each one
 * is then checked in turn, stopping at the first that is still busy. When
 * `timeoutMs` elapses a warning is logged and the promise resolves anyway.
 *
 * @param clientId - id of the client whose associated resources are inspected
 * @param timeoutMs - maximum time to keep polling, in milliseconds
 */
export async function waitForClientRebuildIdle(
    clientId: number,
    timeoutMs = REBUILD_IDLE_HANDLER_TIMEOUT_MS
): Promise<void> {
    const pause = (): Promise<void> =>
        new Promise<void>((resolve) =>
            setTimeout(resolve, REBUILD_IDLE_POLL_INTERVAL_MS)
        );
    const cutoff = Date.now() + timeoutMs;

    while (Date.now() < cutoff) {
        const rows = await db
            .select({
                siteResourceId:
                    clientSiteResourcesAssociationsCache.siteResourceId
            })
            .from(clientSiteResourcesAssociationsCache)
            .where(eq(clientSiteResourcesAssociationsCache.clientId, clientId));

        // Sequential on purpose: stop at the first resource still rebuilding.
        let busy = false;
        for (const row of rows) {
            busy = await hasActiveSiteResourceRebuild(row.siteResourceId);
            if (busy) {
                break;
            }
        }
        if (!busy) {
            return;
        }

        await pause();
    }

    logger.warn(
        `waitForClientRebuildIdle: timed out after ${timeoutMs}ms waiting for clientId=${clientId}`
    );
}
export async function getClientSiteResourceAccess(
siteResource: SiteResource,
trx: Transaction | typeof db = db
@@ -1060,6 +1166,8 @@ export async function handleMessagingForUpdatedSiteResource(
);
// get all of the clients from the cache
const { mergedAllClients, mergedAllClientIds } =
await getClientSiteResourceAccess(updatedSiteResource, trx);
const targets = await generateSubnetProxyTargetV2(
updatedSiteResource,

View File

@@ -13,11 +13,15 @@ export interface RebuildJobHandlers {
/**
 * Contract implemented by rebuild queue backends.
 */
export interface RebuildQueueManager {
// Submits a rebuild job to the queue.
enqueue(job: RebuildJob): Promise<void>;
// Registers the handlers used to process jobs.
// NOTE(review): presumably starts a background consumer — confirm per implementation.
startProcessing(handlers: RebuildJobHandlers): void;
// Resolves to true when the given job is currently reported as queued.
isQueued(job: RebuildJob): Promise<boolean>;
}
/**
 * Queue backend that performs no work: jobs are dropped, no processing is
 * started, and nothing is ever reported as queued.
 */
class NoopRebuildQueue implements RebuildQueueManager {
    /** Accepts the job and discards it. */
    async enqueue(_job: RebuildJob): Promise<void> {
        return;
    }

    /** Ignores the handlers; there is nothing to process. */
    startProcessing(_handlers: RebuildJobHandlers): void {
        return;
    }

    /** Nothing is stored, so the answer is always false. */
    async isQueued(_job: RebuildJob): Promise<boolean> {
        const queued = false;
        return queued;
    }
}
// Module-level queue instance, bound here to the no-op implementation.
export const rebuildQueue: RebuildQueueManager = new NoopRebuildQueue();

View File

@@ -46,6 +46,17 @@ const POLL_INTERVAL_MS = 500;
class RedisRebuildQueue {
private processingStarted = false;
/**
 * Checks whether a job is currently a member of the queued-jobs set.
 *
 * Best effort: resolves to false when the redis client is missing or not in
 * the "ready" state, and also when the membership lookup throws.
 *
 * @param job - job whose `type` and `id` form the set member key
 * @returns true only when the lookup reports membership
 */
async isQueued(job: RebuildJob): Promise<boolean> {
    if (!(redis && redis.status === "ready")) {
        return false;
    }

    const memberKey = `${job.type}:${job.id}`;

    try {
        const reply = await redis.sismember(QUEUED_SET_KEY, memberKey);
        if (reply === 1) {
            return true;
        }
        return false;
    } catch {
        // Lookup failures are treated the same as "not queued".
        return false;
    }
}
async enqueue(job: RebuildJob): Promise<void> {
if (!redis || redis.status !== "ready") {
logger.warn(

View File

@@ -9,6 +9,7 @@ import { buildClientConfigurationForNewtClient } from "./buildConfiguration";
import { convertTargetsIfNecessary } from "../client/targets";
import { canCompress } from "@server/lib/clientVersionChecks";
import config from "@server/lib/config";
import { waitForSiteRebuildIdle } from "@server/lib/rebuildClientAssociations";
export const handleNewtGetConfigMessage: MessageHandler = async (context) => {
const { message, client, sendToClient } = context;
@@ -61,6 +62,8 @@ export const handleNewtGetConfigMessage: MessageHandler = async (context) => {
return;
}
await waitForSiteRebuildIdle(siteId);
// update the endpoint and the public key
const [site] = await db
.update(sites)

View File

@@ -21,6 +21,7 @@ import { build } from "@server/build";
import { canCompress } from "@server/lib/clientVersionChecks";
import config from "@server/lib/config";
import cache from "#dynamic/lib/cache"; // not using regional here because we need this in the register message handler before we know where the client is
import { waitForClientRebuildIdle } from "@server/lib/rebuildClientAssociations";
const HOLEPUNCH_STALE_CHAIN_THRESHOLD = 18;
const HOLEPUNCH_STALE_CHAIN_TTL_SECONDS = 1800;
@@ -385,6 +386,8 @@ export const handleOlmRegisterMessage: MessageHandler = async (context) => {
}
// NOTE: it's important that the client here is the old client and the public key is the new key
await waitForClientRebuildIdle(olm.clientId);
const siteConfigurations = await buildSiteConfigurationForOlmClient(
client,
publicKey,

View File

@@ -17,7 +17,11 @@ import response from "@server/lib/response";
import { eq, and, ne, inArray } from "drizzle-orm";
import { OpenAPITags, registry } from "@server/openApi";
import { isIpInCidr, portRangeStringSchema } from "@server/lib/ip";
import { rebuildClientAssociationsFromSiteResource } from "@server/lib/rebuildClientAssociations";
import {
handleMessagingForUpdatedSiteResource,
rebuildClientAssociationsFromSiteResource,
waitForSiteResourceRebuildIdle
} from "@server/lib/rebuildClientAssociations";
import logger from "@server/logger";
import HttpCode from "@server/types/HttpCode";
import { NextFunction, Request, Response } from "express";
@@ -592,24 +596,27 @@ export async function updateSiteResource(
throw new Error("No updated resource found after update");
}
rebuildClientAssociationsFromSiteResource(updatedSiteResource).catch(
(e) => {
logger.error(
`Failed to rebuild client associations for site resource ${siteResourceId}. Error: ${e}`
);
}
);
const finalUpdatedSiteResource = updatedSiteResource;
handleMessagingForUpdatedSiteResource(
existingSiteResource,
updatedSiteResource,
existingSiteIds,
updatedSiteIds
).catch((e) => {
logger.error(
`Failed to handle messaging for updated site resource ${siteResourceId}. Error: ${e}`
);
});
rebuildClientAssociationsFromSiteResource(finalUpdatedSiteResource)
.then(() =>
waitForSiteResourceRebuildIdle(
finalUpdatedSiteResource.siteResourceId
)
)
.then(() =>
handleMessagingForUpdatedSiteResource(
existingSiteResource,
finalUpdatedSiteResource,
existingSiteIds,
updatedSiteIds
)
)
.catch((e) => {
logger.error(
`Failed to rebuild and handle messaging for site resource ${siteResourceId}. Error: ${e}`
);
});
return response(res, {
data: updatedSiteResource,