mirror of
https://github.com/fosrl/pangolin.git
synced 2026-06-26 09:09:05 +00:00
Add locks to rebuilds
This commit is contained in:
@@ -29,8 +29,11 @@ import { updateResourcePolicies } from "./resourcePolicies";
|
||||
import { BlueprintSource } from "@server/routers/blueprints/types";
|
||||
import { stringify as stringifyYaml } from "yaml";
|
||||
import { generateName } from "@server/db/names";
|
||||
import { handleMessagingForUpdatedSiteResource } from "@server/routers/siteResource";
|
||||
import { rebuildClientAssociationsFromSiteResource } from "../rebuildClientAssociations";
|
||||
import {
|
||||
handleMessagingForUpdatedSiteResource,
|
||||
rebuildClientAssociationsFromSiteResource,
|
||||
waitForSiteResourceRebuildIdle
|
||||
} from "../rebuildClientAssociations";
|
||||
|
||||
type ApplyBlueprintArgs = {
|
||||
orgId: string;
|
||||
@@ -138,26 +141,25 @@ export async function applyBlueprint({
|
||||
for (const result of privateResourcesResults) {
|
||||
rebuildClientAssociationsFromSiteResource(
|
||||
result.newSiteResource
|
||||
).catch((e) => {
|
||||
logger.error(
|
||||
`Failed to rebuild client associations for site resource ${result.newSiteResource.siteResourceId}. Error: ${e}`
|
||||
);
|
||||
});
|
||||
|
||||
handleMessagingForUpdatedSiteResource(
|
||||
result.oldSiteResource,
|
||||
result.newSiteResource,
|
||||
result.oldSites.map((site) => ({
|
||||
// only need to run this on the old sites because the new sites are added above
|
||||
siteId: site.siteId,
|
||||
orgId: result.newSiteResource.orgId
|
||||
}))
|
||||
).catch((err) => {
|
||||
logger.error(
|
||||
`Error handling messaging for updated site resource ${result.newSiteResource.siteResourceId}:`,
|
||||
err
|
||||
);
|
||||
});
|
||||
)
|
||||
.then(() =>
|
||||
waitForSiteResourceRebuildIdle(
|
||||
result.newSiteResource.siteResourceId
|
||||
)
|
||||
)
|
||||
.then(() =>
|
||||
handleMessagingForUpdatedSiteResource(
|
||||
result.oldSiteResource,
|
||||
result.newSiteResource,
|
||||
result.oldSites.map((s) => s.siteId),
|
||||
result.newSites.map((s) => s.siteId)
|
||||
)
|
||||
)
|
||||
.catch((e) => {
|
||||
logger.error(
|
||||
`Failed to rebuild and handle messaging for site resource ${result.newSiteResource.siteResourceId}. Error: ${e}`
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
logger.debug(
|
||||
|
||||
@@ -35,7 +35,7 @@ export class LockManager {
|
||||
ttl: number;
|
||||
owner?: string;
|
||||
}> {
|
||||
return { exists: true, ownedByMe: true, ttl: 0 };
|
||||
return { exists: false, ownedByMe: false, ttl: 0 };
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -49,6 +49,112 @@ import { rebuildQueue } from "#dynamic/lib/rebuildQueue";
|
||||
// peer/proxy updates, so give them a generous window.
|
||||
const REBUILD_ASSOCIATIONS_LOCK_TTL_MS = 120000;
|
||||
|
||||
// Pause between successive idle checks while polling for rebuilds to finish.
const REBUILD_IDLE_POLL_INTERVAL_MS = 300;

// Default wait for a single site resource; kept slightly longer than the lock TTL.
const REBUILD_IDLE_DEFAULT_TIMEOUT_MS = 130000;

// Default wait used by the per-site and per-client idle helpers.
const REBUILD_IDLE_HANDLER_TIMEOUT_MS = 5000;
|
||||
|
||||
/**
 * Reports whether a client-association rebuild is in flight for a site
 * resource: either the rebuild lock for that resource currently exists, or a
 * matching "site-resource" job is still present in the rebuild queue.
 *
 * @param siteResourceId - ID of the site resource to check.
 * @returns `true` when the lock exists or the job is queued, otherwise `false`.
 */
export async function hasActiveSiteResourceRebuild(
    siteResourceId: number
): Promise<boolean> {
    const { exists: lockHeld } = await lockManager.getLockInfo(
        `rebuild-client-associations:site-resource:${siteResourceId}`
    );

    // The queue is only consulted when the lock does not exist.
    return lockHeld
        ? true
        : rebuildQueue.isQueued({ type: "site-resource", id: siteResourceId });
}
|
||||
|
||||
/**
 * Polls until the given site resource has no rebuild running or queued.
 *
 * The check repeats every REBUILD_IDLE_POLL_INTERVAL_MS. If the deadline
 * passes first, a warning is logged and the promise still resolves; the
 * timeout itself never causes a rejection.
 *
 * @param siteResourceId - ID of the site resource to wait on.
 * @param timeoutMs - Maximum time to keep polling, in milliseconds.
 */
export async function waitForSiteResourceRebuildIdle(
    siteResourceId: number,
    timeoutMs = REBUILD_IDLE_DEFAULT_TIMEOUT_MS
): Promise<void> {
    const giveUpAt = Date.now() + timeoutMs;

    for (;;) {
        // The deadline is evaluated before every probe, including the first.
        if (Date.now() >= giveUpAt) break;

        const busy = await hasActiveSiteResourceRebuild(siteResourceId);
        if (!busy) return;

        await new Promise<void>((wake) =>
            setTimeout(wake, REBUILD_IDLE_POLL_INTERVAL_MS)
        );
    }

    logger.warn(
        `waitForSiteResourceRebuildIdle: timed out after ${timeoutMs}ms waiting for siteResourceId=${siteResourceId}`
    );
}
|
||||
|
||||
/**
 * Polls until none of the site resources linked to a site has a rebuild
 * running or queued.
 *
 * On every poll the site's resources are looked up again (siteResources joined
 * to siteNetworks on networkId, filtered by siteId) and probed one at a time;
 * probing stops at the first resource that is still busy. If the deadline
 * passes, a warning is logged and the promise resolves anyway.
 *
 * @param siteId - ID of the site whose resources should be idle.
 * @param timeoutMs - Maximum time to keep polling, in milliseconds.
 */
export async function waitForSiteRebuildIdle(
    siteId: number,
    timeoutMs = REBUILD_IDLE_HANDLER_TIMEOUT_MS
): Promise<void> {
    const giveUpAt = Date.now() + timeoutMs;

    // Resolves to true as soon as one of the site's resources is rebuilding.
    const anyRebuildInFlight = async (): Promise<boolean> => {
        const rows = await db
            .select({ siteResourceId: siteResources.siteResourceId })
            .from(siteResources)
            .innerJoin(
                siteNetworks,
                eq(siteNetworks.networkId, siteResources.networkId)
            )
            .where(eq(siteNetworks.siteId, siteId));

        for (const row of rows) {
            if (await hasActiveSiteResourceRebuild(row.siteResourceId)) {
                return true;
            }
        }
        return false;
    };

    while (Date.now() < giveUpAt) {
        if (!(await anyRebuildInFlight())) return;

        await new Promise<void>((wake) =>
            setTimeout(wake, REBUILD_IDLE_POLL_INTERVAL_MS)
        );
    }

    logger.warn(
        `waitForSiteRebuildIdle: timed out after ${timeoutMs}ms waiting for siteId=${siteId}`
    );
}
|
||||
|
||||
/**
 * Polls until none of the site resources associated with a client has a
 * rebuild running or queued.
 *
 * On every poll the client's resource IDs are read again from
 * clientSiteResourcesAssociationsCache and probed one at a time; probing stops
 * at the first resource that is still busy. If the deadline passes, a warning
 * is logged and the promise resolves anyway.
 *
 * @param clientId - ID of the client whose resources should be idle.
 * @param timeoutMs - Maximum time to keep polling, in milliseconds.
 */
export async function waitForClientRebuildIdle(
    clientId: number,
    timeoutMs = REBUILD_IDLE_HANDLER_TIMEOUT_MS
): Promise<void> {
    const giveUpAt = Date.now() + timeoutMs;

    // Resolves to true as soon as one of the client's resources is rebuilding.
    const anyRebuildInFlight = async (): Promise<boolean> => {
        const rows = await db
            .select({
                siteResourceId:
                    clientSiteResourcesAssociationsCache.siteResourceId
            })
            .from(clientSiteResourcesAssociationsCache)
            .where(eq(clientSiteResourcesAssociationsCache.clientId, clientId));

        for (const row of rows) {
            if (await hasActiveSiteResourceRebuild(row.siteResourceId)) {
                return true;
            }
        }
        return false;
    };

    while (Date.now() < giveUpAt) {
        if (!(await anyRebuildInFlight())) return;

        await new Promise<void>((wake) =>
            setTimeout(wake, REBUILD_IDLE_POLL_INTERVAL_MS)
        );
    }

    logger.warn(
        `waitForClientRebuildIdle: timed out after ${timeoutMs}ms waiting for clientId=${clientId}`
    );
}
|
||||
|
||||
export async function getClientSiteResourceAccess(
|
||||
siteResource: SiteResource,
|
||||
trx: Transaction | typeof db = db
|
||||
@@ -1060,6 +1166,8 @@ export async function handleMessagingForUpdatedSiteResource(
|
||||
);
|
||||
|
||||
// get all of the clients from the cache
|
||||
const { mergedAllClients, mergedAllClientIds } =
|
||||
await getClientSiteResourceAccess(updatedSiteResource, trx);
|
||||
|
||||
const targets = await generateSubnetProxyTargetV2(
|
||||
updatedSiteResource,
|
||||
|
||||
@@ -13,11 +13,15 @@ export interface RebuildJobHandlers {
|
||||
/**
 * Contract shared by the rebuild queue implementations.
 */
export interface RebuildQueueManager {
    /** Submits a rebuild job to the queue. */
    enqueue(job: RebuildJob): Promise<void>;

    /** Resolves to whether the given job is currently recorded as queued. */
    isQueued(job: RebuildJob): Promise<boolean>;

    /**
     * Registers the handlers used to process jobs.
     * NOTE(review): presumably also starts the processing loop — confirm
     * against the concrete implementations.
     */
    startProcessing(handlers: RebuildJobHandlers): void;
}
|
||||
|
||||
/**
 * Queue implementation that does nothing: enqueue and startProcessing have
 * empty bodies, and isQueued always reports `false`.
 */
class NoopRebuildQueue implements RebuildQueueManager {
    /** Accepts the job and discards it. */
    enqueue(_job: RebuildJob): Promise<void> {
        return Promise.resolve();
    }

    /** Ignores the handlers; nothing is ever processed. */
    startProcessing(_handlers: RebuildJobHandlers): void {
        return;
    }

    /** Nothing is ever queued, so this always resolves to `false`. */
    isQueued(_job: RebuildJob): Promise<boolean> {
        return Promise.resolve(false);
    }
}
|
||||
|
||||
export const rebuildQueue: RebuildQueueManager = new NoopRebuildQueue();
|
||||
|
||||
@@ -46,6 +46,17 @@ const POLL_INTERVAL_MS = 500;
|
||||
class RedisRebuildQueue {
|
||||
private processingStarted = false;
|
||||
|
||||
/**
 * Checks whether a job with the same type and id is a member of the Redis
 * set stored under QUEUED_SET_KEY.
 *
 * Resolves to `false` when the Redis client is missing or not in the
 * "ready" state, and also when the membership lookup throws.
 */
async isQueued(job: RebuildJob): Promise<boolean> {
    if (!(redis && redis.status === "ready")) {
        return false;
    }

    const setMember = `${job.type}:${job.id}`;
    try {
        return (await redis.sismember(QUEUED_SET_KEY, setMember)) === 1;
    } catch {
        // Best effort: a failed lookup is reported as "not queued".
        return false;
    }
}
|
||||
|
||||
async enqueue(job: RebuildJob): Promise<void> {
|
||||
if (!redis || redis.status !== "ready") {
|
||||
logger.warn(
|
||||
|
||||
@@ -9,6 +9,7 @@ import { buildClientConfigurationForNewtClient } from "./buildConfiguration";
|
||||
import { convertTargetsIfNecessary } from "../client/targets";
|
||||
import { canCompress } from "@server/lib/clientVersionChecks";
|
||||
import config from "@server/lib/config";
|
||||
import { waitForSiteRebuildIdle } from "@server/lib/rebuildClientAssociations";
|
||||
|
||||
export const handleNewtGetConfigMessage: MessageHandler = async (context) => {
|
||||
const { message, client, sendToClient } = context;
|
||||
@@ -61,6 +62,8 @@ export const handleNewtGetConfigMessage: MessageHandler = async (context) => {
|
||||
return;
|
||||
}
|
||||
|
||||
await waitForSiteRebuildIdle(siteId);
|
||||
|
||||
// update the endpoint and the public key
|
||||
const [site] = await db
|
||||
.update(sites)
|
||||
|
||||
@@ -21,6 +21,7 @@ import { build } from "@server/build";
|
||||
import { canCompress } from "@server/lib/clientVersionChecks";
|
||||
import config from "@server/lib/config";
|
||||
import cache from "#dynamic/lib/cache"; // not using regional here because we need this in the register message handler before we know where the client is
|
||||
import { waitForClientRebuildIdle } from "@server/lib/rebuildClientAssociations";
|
||||
|
||||
const HOLEPUNCH_STALE_CHAIN_THRESHOLD = 18;
|
||||
const HOLEPUNCH_STALE_CHAIN_TTL_SECONDS = 1800;
|
||||
@@ -385,6 +386,8 @@ export const handleOlmRegisterMessage: MessageHandler = async (context) => {
|
||||
}
|
||||
|
||||
// NOTE: it's important that the client here is the old client and the public key is the new key
|
||||
await waitForClientRebuildIdle(olm.clientId);
|
||||
|
||||
const siteConfigurations = await buildSiteConfigurationForOlmClient(
|
||||
client,
|
||||
publicKey,
|
||||
|
||||
@@ -17,7 +17,11 @@ import response from "@server/lib/response";
|
||||
import { eq, and, ne, inArray } from "drizzle-orm";
|
||||
import { OpenAPITags, registry } from "@server/openApi";
|
||||
import { isIpInCidr, portRangeStringSchema } from "@server/lib/ip";
|
||||
import { rebuildClientAssociationsFromSiteResource } from "@server/lib/rebuildClientAssociations";
|
||||
import {
|
||||
handleMessagingForUpdatedSiteResource,
|
||||
rebuildClientAssociationsFromSiteResource,
|
||||
waitForSiteResourceRebuildIdle
|
||||
} from "@server/lib/rebuildClientAssociations";
|
||||
import logger from "@server/logger";
|
||||
import HttpCode from "@server/types/HttpCode";
|
||||
import { NextFunction, Request, Response } from "express";
|
||||
@@ -592,24 +596,27 @@ export async function updateSiteResource(
|
||||
throw new Error("No updated resource found after update");
|
||||
}
|
||||
|
||||
rebuildClientAssociationsFromSiteResource(updatedSiteResource).catch(
|
||||
(e) => {
|
||||
logger.error(
|
||||
`Failed to rebuild client associations for site resource ${siteResourceId}. Error: ${e}`
|
||||
);
|
||||
}
|
||||
);
|
||||
const finalUpdatedSiteResource = updatedSiteResource;
|
||||
|
||||
handleMessagingForUpdatedSiteResource(
|
||||
existingSiteResource,
|
||||
updatedSiteResource,
|
||||
existingSiteIds,
|
||||
updatedSiteIds
|
||||
).catch((e) => {
|
||||
logger.error(
|
||||
`Failed to handle messaging for updated site resource ${siteResourceId}. Error: ${e}`
|
||||
);
|
||||
});
|
||||
rebuildClientAssociationsFromSiteResource(finalUpdatedSiteResource)
|
||||
.then(() =>
|
||||
waitForSiteResourceRebuildIdle(
|
||||
finalUpdatedSiteResource.siteResourceId
|
||||
)
|
||||
)
|
||||
.then(() =>
|
||||
handleMessagingForUpdatedSiteResource(
|
||||
existingSiteResource,
|
||||
finalUpdatedSiteResource,
|
||||
existingSiteIds,
|
||||
updatedSiteIds
|
||||
)
|
||||
)
|
||||
.catch((e) => {
|
||||
logger.error(
|
||||
`Failed to rebuild and handle messaging for site resource ${siteResourceId}. Error: ${e}`
|
||||
);
|
||||
});
|
||||
|
||||
return response(res, {
|
||||
data: updatedSiteResource,
|
||||
|
||||
Reference in New Issue
Block a user