import * as plugins from '../plugins.js';
import * as paths from '../paths.js';
import { LawRecord } from './classes.lawrecord.js';
import type {
  ILawLookupRequest,
  ILawSearchRequest,
  ILawServiceConfig,
  ILawSyncRequest,
  ILawSyncResult,
  TJurisdiction,
  TLawSource,
  TRawLawFormat,
  TUsLawCollection,
} from './interfaces.law.js';

/** Normalized payload handed to `upsertLaw`, independent of the upstream source. */
interface IStoredLawData {
  jurisdiction: TJurisdiction;
  source: TLawSource;
  identifier: string;
  title: string;
  shortTitle?: string;
  citation?: string;
  type?: string;
  language?: string;
  sourceUrl: string;
  rawFormat: TRawLawFormat;
  rawBody: string;
  text: string;
  dateIssued?: string;
  lastModified?: string;
  sourceMeta?: Record<string, unknown>;
}

/** One entry of the gesetze-im-internet table of contents. */
interface IGermanyLawTocEntry {
  title: string;
  xmlZipUrl: string;
  slug: string;
}

/** Metadata for one EU act as mapped from a SPARQL result row. */
interface IEuLawMetadata {
  celex: string;
  title: string;
  type: string;
  dateIssued: string;
  eli: string;
  lastModified: string;
  language: string;
}

/** Subset of the SPARQL JSON results format that this service reads. */
interface ISparqlJsonResult {
  results?: {
    bindings?: Array<Record<string, { value: string } | undefined>>;
  };
}

/** Subset of a GovInfo collection listing response that this service reads. */
interface IGovInfoCollectionResponse {
  nextPage?: string;
  packages?: Array<{
    packageId: string;
  }>;
}

/** Subset of a GovInfo package summary response that this service reads. */
interface IGovInfoSummary {
  packageId: string;
  collectionCode: string;
  title?: string;
  shortTitle?: Array<{
    title: string;
  }>;
  detailsLink?: string;
  dateIssued?: string;
  lastModified?: string;
  titleNumber?: string;
  congress?: string;
  documentNumber?: string;
  documentType?: string;
  download?: {
    txtLink?: string;
    uslmLink?: string;
  };
}

/** Content scraped from a rendered EUR-Lex document page. */
interface IEuPageContent {
  html: string;
  text: string;
  title: string;
  dateIssued: string;
  type: string;
  eli: string;
}

/** A parsed U.S. Code citation such as "42 U.S.C. 1983(a)". */
interface IUsCodeCitation {
  titleNumber: string;
  sectionNumber: string;
  subsectionPath: string;
  canonicalIdentifier: string;
  canonicalCitation: string;
  sourceUrl: string;
}

/**
 * Fetches laws for the 'de', 'eu' and 'us' jurisdictions from their upstream
 * sources and stores them as `LawRecord` documents in a local database.
 *
 * All public methods start the service on demand; call `stop()` to release the
 * database and the headless browser.
 */
export class LawService {
  public db!: plugins.smartdata.SmartdataDb;
  public CLawRecord = plugins.smartdata.setDefaultManagerForDoc(this, LawRecord);

  private localSmartDb?: plugins.smartdb.LocalSmartDb;
  private smartBrowser?: plugins.smartbrowser.SmartBrowser;
  private browserStarted = false;
  private started = false;
  private config: Required<Pick<ILawServiceConfig, 'dbFolderPath' | 'dbName' | 'govInfoApiKey'>>;

  /**
   * @param configArg optional overrides; missing values fall back to a folder
   * below the package directory, the database name 'laws' and 'DEMO_KEY'.
   */
  constructor(configArg: ILawServiceConfig = {}) {
    this.config = {
      dbFolderPath:
        configArg.dbFolderPath ?? plugins.path.join(paths.packageDir, '.nogit', 'law-smartdb'),
      dbName: configArg.dbName ?? 'laws',
      govInfoApiKey: configArg.govInfoApiKey ?? 'DEMO_KEY',
    };
  }

  /** Creates the database folder, starts the local database and connects to it. No-op when already started. */
  public async start() {
    if (this.started) {
      return;
    }
    await plugins.smartfs.directory(this.config.dbFolderPath).create();
    this.localSmartDb = new plugins.smartdb.LocalSmartDb({
      folderPath: this.config.dbFolderPath,
    });
    const connectionInfo = await this.localSmartDb.start();
    this.db = new plugins.smartdata.SmartdataDb({
      mongoDbUrl: connectionInfo.connectionUri,
      mongoDbName: this.config.dbName,
    });
    await this.db.init();

    // Write and immediately remove a marker document.
    // NOTE(review): presumably forces the database to materialize before first use; confirm.
    const bootstrapCollection = this.db.mongoDb.collection('_opendata_bootstrap');
    await bootstrapCollection.insertOne({
      createdAt: new Date(),
    });
    await bootstrapCollection.deleteMany({});
    this.started = true;
  }

  /** Stops the headless browser (if it was started) and closes the database. */
  public async stop() {
    try {
      if (this.browserStarted && this.smartBrowser) {
        await this.smartBrowser.stop();
        this.browserStarted = false;
      }
    } finally {
      // Close the database even when stopping the browser failed.
      if (this.started) {
        await this.db.close();
        await this.localSmartDb?.stop();
        this.started = false;
      }
    }
  }

  /**
   * Fetches a single law from its upstream source and stores it.
   * @throws Error for a jurisdiction other than 'de', 'eu' or 'us'.
   */
  public async syncLaw(requestArg: ILawLookupRequest): Promise<LawRecord> {
    await this.ensureStarted();
    switch (requestArg.jurisdiction) {
      case 'de':
        return this.syncGermanyLaw(requestArg.identifier);
      case 'eu':
        return this.syncEuLaw(requestArg.identifier, requestArg.language ?? 'EN');
      case 'us':
        return this.syncUsLaw(
          requestArg.identifier,
          requestArg.usCollection,
          this.config.govInfoApiKey
        );
      default:
        throw new Error(`Unsupported jurisdiction: ${requestArg.jurisdiction}`);
    }
  }

  /** Returns the stored law for the request, syncing it first when it is missing or `forceSync` is set. */
  public async getLaw(requestArg: ILawLookupRequest): Promise<LawRecord> {
    await this.ensureStarted();
    const lookupKey = this.createLookupKey(requestArg.jurisdiction, requestArg.identifier);
    const existingLaw = await this.getLawByLookupKey(lookupKey);
    if (existingLaw && !requestArg.forceSync) {
      return existingLaw;
    }
    return this.syncLaw(requestArg);
  }

  /**
   * Searches stored laws, optionally restricted to one jurisdiction.
   * Returns at most `limit` records (default 20).
   */
  public async searchLaws(requestArg: ILawSearchRequest): Promise<LawRecord[]> {
    await this.ensureStarted();
    // A negative limit would make slice() drop trailing results instead of limiting.
    const limit = Math.max(0, requestArg.limit ?? 20);
    const baseFilter = requestArg.jurisdiction
      ? ({ jurisdiction: requestArg.jurisdiction } as Record<string, unknown>)
      : undefined;

    try {
      const results = await this.CLawRecord.search(
        requestArg.query,
        baseFilter ? { filter: baseFilter } : undefined
      );
      return results.slice(0, limit);
    } catch {
      // Deliberate best effort: when search() throws, fall back to a
      // case-insensitive literal match over the main text fields.
      const regex = new RegExp(this.escapeRegex(requestArg.query), 'i');
      const searchFilter = {
        $or: [
          { identifier: { $regex: regex } },
          { title: { $regex: regex } },
          { shortTitle: { $regex: regex } },
          { citation: { $regex: regex } },
          { text: { $regex: regex } },
        ],
      };
      const mongoFilter = baseFilter
        ? {
            $and: [baseFilter, searchFilter],
          }
        : searchFilter;
      const results = await this.CLawRecord.getInstances(mongoFilter as any);
      return results.slice(0, limit);
    }
  }

  /**
   * Syncs a batch of laws for one jurisdiction.
   * @throws Error for a jurisdiction other than 'de', 'eu' or 'us'.
   */
  public async syncLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    await this.ensureStarted();
    switch (requestArg.jurisdiction) {
      case 'de':
        return this.syncGermanyLaws(requestArg);
      case 'eu':
        return this.syncEuLaws(requestArg);
      case 'us':
        return this.syncUsLaws(requestArg);
      default:
        throw new Error(`Unsupported jurisdiction: ${requestArg.jurisdiction}`);
    }
  }

  /** Starts the service if it is not running yet. */
  private async ensureStarted() {
    if (!this.started) {
      await this.start();
    }
  }

  /** Lazily creates and starts the shared headless browser. */
  private async ensureBrowser() {
    if (!this.smartBrowser) {
      this.smartBrowser = new plugins.smartbrowser.SmartBrowser();
    }
    if (!this.browserStarted) {
      await this.smartBrowser.start();
      this.browserStarted = true;
    }
    return this.smartBrowser;
  }

  /** Builds the lookup key `<jurisdiction>:<normalized identifier>`. */
  private createLookupKey(jurisdictionArg: TJurisdiction, identifierArg: string) {
    return `${jurisdictionArg}:${this.normalizeIdentifierForLookup(jurisdictionArg, identifierArg)}`;
  }

  /** Normalizes an identifier so that equivalent spellings map to the same lookup key. */
  private normalizeIdentifierForLookup(jurisdictionArg: TJurisdiction, identifierArg: string) {
    switch (jurisdictionArg) {
      case 'de':
        return this.normalizeGermanySlug(identifierArg);
      case 'eu':
        return this.normalizeCelex(identifierArg);
      case 'us': {
        const usCodeCitation = this.parseUsCodeCitation(identifierArg);
        return usCodeCitation
          ? usCodeCitation.canonicalIdentifier
          : identifierArg.trim().toUpperCase();
      }
      default:
        return identifierArg.trim();
    }
  }

  /** Trims a CELEX identifier, drops an optional leading "CELEX:" and upper-cases it. */
  private normalizeCelex(identifierArg: string) {
    return identifierArg.trim().replace(/^CELEX:/i, '').toUpperCase();
  }

  /** Loads a stored law by its lookup key via `LawRecord.getByLookupKey`. */
  private async getLawByLookupKey(lookupKeyArg: string) {
    return LawRecord.getByLookupKey(lookupKeyArg);
  }

  /** Creates or updates the record identified by the data's lookup key and saves it. */
  private async upsertLaw(dataArg: IStoredLawData): Promise<LawRecord> {
    const lookupKey = this.createLookupKey(dataArg.jurisdiction, dataArg.identifier);
    let lawRecord = await this.getLawByLookupKey(lookupKey);
    if (!lawRecord) {
      lawRecord = new this.CLawRecord();
      lawRecord.id = await this.CLawRecord.getNewId();
      lawRecord.lookupKey = lookupKey;
    }

    lawRecord.jurisdiction = dataArg.jurisdiction;
    lawRecord.source = dataArg.source;
    lawRecord.identifier = dataArg.identifier;
    lawRecord.title = dataArg.title;
    lawRecord.shortTitle = dataArg.shortTitle ?? '';
    lawRecord.citation = dataArg.citation ?? '';
    lawRecord.type = dataArg.type ?? '';
    lawRecord.language = dataArg.language ?? '';
    lawRecord.sourceUrl = dataArg.sourceUrl;
    lawRecord.rawFormat = dataArg.rawFormat;
    lawRecord.rawBody = dataArg.rawBody;
    lawRecord.text = dataArg.text;
    lawRecord.dateIssued = dataArg.dateIssued ?? '';
    lawRecord.lastModified = dataArg.lastModified ?? '';
    lawRecord.sourceMeta = dataArg.sourceMeta ?? {};
    lawRecord.fetchedAt = new Date();
    lawRecord.syncedAt = new Date();
    await lawRecord.save();
    return lawRecord;
  }

  /** Syncs German laws listed in the gesetze-im-internet table of contents, honoring `offset` and `limit`. */
  private async syncGermanyLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const tocXml = await this.fetchText('https://www.gesetze-im-internet.de/gii-toc.xml');
    const tocEntries = this.parseGermanyToc(tocXml);
    const offset = requestArg.offset ?? 0;
    const targetEntries =
      typeof requestArg.limit === 'number'
        ? tocEntries.slice(offset, offset + requestArg.limit)
        : tocEntries.slice(offset);

    const identifiers: string[] = [];
    for (const entry of targetEntries) {
      const syncedLaw = await this.syncGermanyLaw(entry);
      identifiers.push(syncedLaw.identifier);
    }
    return {
      jurisdiction: 'de',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Downloads and stores one German law.
   * @param identifierOrEntryArg a slug / URL, or a table-of-contents entry.
   * @throws Error when the downloaded archive contains no file.
   */
  private async syncGermanyLaw(
    identifierOrEntryArg: string | IGermanyLawTocEntry
  ): Promise<LawRecord> {
    let germanyEntry: IGermanyLawTocEntry;
    if (typeof identifierOrEntryArg === 'string') {
      const slug = this.normalizeGermanySlug(identifierOrEntryArg);
      germanyEntry = {
        slug,
        title: '',
        xmlZipUrl: `https://www.gesetze-im-internet.de/${slug}/xml.zip`,
      };
    } else {
      germanyEntry = identifierOrEntryArg;
    }

    const extractedFiles = await plugins.smartarchive.SmartArchive.create()
      .url(germanyEntry.xmlZipUrl)
      .toSmartFiles();
    // Prefer an .xml entry; otherwise fall back to the first file of the archive.
    const xmlFile =
      extractedFiles.find((fileArg: plugins.smartfile.SmartFile) =>
        fileArg.relative.toLowerCase().endsWith('.xml')
      ) ?? extractedFiles[0];
    if (!xmlFile) {
      throw new Error(`No XML file found for German law ${germanyEntry.slug}`);
    }

    const xmlBody = xmlFile.parseContentAsString('utf8');
    const title =
      this.extractTagValue(xmlBody, 'langue') || germanyEntry.title || germanyEntry.slug;
    const citation = this.extractTagValue(xmlBody, 'jurabk') || germanyEntry.slug.toUpperCase();
    const dateIssued = this.extractTagValue(xmlBody, 'ausfertigung-datum');

    return this.upsertLaw({
      jurisdiction: 'de',
      source: 'gesetze-im-internet',
      identifier: germanyEntry.slug,
      title,
      shortTitle: citation,
      citation,
      type: 'law',
      language: 'de',
      sourceUrl: germanyEntry.xmlZipUrl,
      rawFormat: 'xml',
      rawBody: xmlBody,
      text: this.markupToText(xmlBody),
      dateIssued,
      sourceMeta: {
        slug: germanyEntry.slug,
      },
    });
  }

  /** Extracts title / link pairs from the `<item>` elements of the table-of-contents XML. */
  private parseGermanyToc(tocXmlArg: string): IGermanyLawTocEntry[] {
    const items: IGermanyLawTocEntry[] = [];
    const itemRegex =
      /<item>\s*<title>([\s\S]*?)<\/title>\s*<link>([\s\S]*?)<\/link>\s*<\/item>/g;
    for (const match of tocXmlArg.matchAll(itemRegex)) {
      const [, rawTitle = '', rawLink = ''] = match;
      const title = this.decodeHtmlEntities(rawTitle.trim());
      const xmlZipUrl = rawLink.trim().replace('http://', 'https://');
      const slug = this.normalizeGermanySlug(xmlZipUrl);
      items.push({
        title,
        xmlZipUrl,
        slug,
      });
    }
    return items;
  }

  /** Reduces a gesetze-im-internet URL or path to the bare law slug. */
  private normalizeGermanySlug(identifierArg: string) {
    return identifierArg
      .trim()
      .replace(/^https?:\/\/www\.gesetze-im-internet\.de\//, '')
      .replace(/\/xml\.zip$/i, '')
      .replace(/\/index\.html$/i, '')
      .replace(/^\/+/, '')
      .replace(/\/+$/, '');
  }

  /** Syncs EU acts page by page (25 per page) until `limit` is reached or no more metadata is returned. */
  private async syncEuLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const language = (requestArg.language ?? 'EN').toUpperCase();
    const offset = requestArg.offset ?? 0;
    const identifiers: string[] = [];
    const pageSize = 25;
    let currentOffset = offset;
    let remaining = requestArg.limit;

    while (remaining === undefined || remaining > 0) {
      const currentPageSize = remaining === undefined ? pageSize : Math.min(pageSize, remaining);
      const metadataPage = await this.fetchEuMetadataPage(currentPageSize, currentOffset, language);
      if (metadataPage.length === 0) {
        break;
      }
      for (const metadata of metadataPage) {
        const syncedLaw = await this.syncEuLaw(metadata.celex, language, metadata);
        identifiers.push(syncedLaw.identifier);
      }
      currentOffset += metadataPage.length;
      if (remaining !== undefined) {
        remaining -= metadataPage.length;
      }
    }

    return {
      jurisdiction: 'eu',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Scrapes one EU act from EUR-Lex and stores it.
   * @param celexArg CELEX number, with or without a leading "CELEX:".
   * @param metadataArg pre-fetched metadata; when omitted it is derived from the page.
   */
  private async syncEuLaw(
    celexArg: string,
    languageArg = 'EN',
    metadataArg?: IEuLawMetadata
  ): Promise<LawRecord> {
    const language = languageArg.toUpperCase();
    // Normalize exactly like the lookup key does, so "celex:..." input neither
    // yields a "CELEX:CELEX:..." URL nor a differently spelled stored identifier.
    const celex = this.normalizeCelex(celexArg);
    const pageUrl = `https://eur-lex.europa.eu/legal-content/${encodeURIComponent(language)}/TXT/?uri=CELEX:${encodeURIComponent(celex)}`;
    const euPageContent = await this.fetchEuLawPageContent(pageUrl);
    const metadata: IEuLawMetadata = metadataArg ?? {
      celex,
      title: euPageContent.title,
      type: euPageContent.type,
      dateIssued: euPageContent.dateIssued,
      eli: euPageContent.eli,
      lastModified: '',
      language,
    };

    return this.upsertLaw({
      jurisdiction: 'eu',
      source: 'eur-lex',
      identifier: metadata.celex,
      title:
        metadata.title ||
        euPageContent.title ||
        this.firstMeaningfulLine(euPageContent.text) ||
        metadata.celex,
      citation: metadata.eli || euPageContent.eli || metadata.celex,
      type: metadata.type || euPageContent.type,
      language: metadata.language.toLowerCase(),
      sourceUrl: pageUrl,
      rawFormat: 'html',
      rawBody: euPageContent.html,
      text: euPageContent.text,
      dateIssued: metadata.dateIssued || euPageContent.dateIssued,
      lastModified: metadata.lastModified,
      sourceMeta: {
        celex: metadata.celex,
        eli: metadata.eli || euPageContent.eli,
      },
    });
  }

  /**
   * Opens a EUR-Lex page in the headless browser and extracts text, title parts, date and ELI link.
   * @throws Error when the extracted text is empty.
   */
  private async fetchEuLawPageContent(pageUrlArg: string): Promise<IEuPageContent> {
    const smartBrowser = await this.ensureBrowser();
    const page = await smartBrowser.headlessBrowser.newPage();
    try {
      await page.goto(pageUrlArg, {
        waitUntil: 'domcontentloaded',
        timeout: 60000,
      });
      await page.waitForSelector('#text', {
        timeout: 60000,
      });
      const pageContent = await page.evaluate(() => {
        // Use the '#text' container when present, otherwise the whole body.
        const contentNode =
          (document.querySelector('#text') as HTMLElement | null) ?? document.body;
        const titleParts = Array.from(document.querySelectorAll('#document1 p.oj-doc-ti'))
          .map((elementArg) => elementArg.textContent?.trim() || '')
          .filter((itemArg) => itemArg.length > 0);
        const dateIssued =
          (
            document.querySelector('#document1 p.oj-hd-date') as HTMLElement | null
          )?.innerText?.trim() || '';
        const eli =
          (document.querySelector('a[href*="data.europa.eu/eli"]') as HTMLAnchorElement | null)
            ?.href || '';
        return {
          html: contentNode.innerHTML,
          text: contentNode.innerText,
          title: titleParts.join(' '),
          dateIssued,
          type: titleParts[0] || '',
          eli,
        };
      });

      if (!pageContent.text.trim()) {
        throw new Error(`EUR-Lex returned empty text for ${pageUrlArg}`);
      }
      return pageContent;
    } finally {
      // Closing the page is best effort; a failure here must not mask the result.
      await page.close().catch(() => {});
    }
  }

  /** Queries one page of metadata for regulations, directives and decisions, newest first. */
  private async fetchEuMetadataPage(
    limitArg: number,
    offsetArg: number,
    languageArg: string
  ): Promise<IEuLawMetadata[]> {
    const titleLanguage = this.escapeSparqlString(languageArg.toLowerCase());
    const sparqlQuery = `
      PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
      PREFIX resource-type: <http://publications.europa.eu/resource/authority/resource-type/>
      PREFIX owl: <http://www.w3.org/2002/07/owl#>
      PREFIX cmr: <http://publications.europa.eu/ontology/cdm/cmr#>
      SELECT ?celex ?title ?date ?type_code (SAMPLE(?eli_candidate) AS ?eli) (MAX(?modified_candidate) AS ?modified_at)
      WHERE {
        VALUES ?type { resource-type:REG resource-type:DIR resource-type:DEC }
        ?work a cdm:resource_legal ;
          cdm:resource_legal_id_celex ?celex ;
          cdm:work_title ?title ;
          cdm:work_date_document ?date ;
          cdm:work_has_resource-type ?type .
        FILTER(LANG(?title) = "${titleLanguage}")
        BIND(REPLACE(STR(?type), "^.*/resource-type/", "") AS ?type_code)
        OPTIONAL {
          ?work owl:sameAs ?eli_candidate .
          FILTER(CONTAINS(STR(?eli_candidate), "/resource/eli/"))
        }
        OPTIONAL {
          ?work cmr:lastModificationDate ?modified_candidate .
        }
      }
      GROUP BY ?celex ?title ?date ?type_code
      ORDER BY DESC(?date) DESC(?celex)
      LIMIT ${limitArg}
      OFFSET ${offsetArg}
    `.trim();
    return this.fetchEuMetadataFromQuery(sparqlQuery, languageArg);
  }

  /**
   * Queries the metadata of a single act by CELEX number.
   * @throws Error when the query returns no row.
   */
  private async fetchEuMetadataByCelex(
    celexArg: string,
    languageArg: string
  ): Promise<IEuLawMetadata> {
    // Both values end up inside SPARQL string literals, so they must be escaped.
    const celexLiteral = this.escapeSparqlString(celexArg);
    const titleLanguage = this.escapeSparqlString(languageArg.toLowerCase());
    const sparqlQuery = `
      PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
      PREFIX resource-type: <http://publications.europa.eu/resource/authority/resource-type/>
      PREFIX owl: <http://www.w3.org/2002/07/owl#>
      PREFIX cmr: <http://publications.europa.eu/ontology/cdm/cmr#>
      SELECT ?celex ?title ?date ?type_code (SAMPLE(?eli_candidate) AS ?eli) (MAX(?modified_candidate) AS ?modified_at)
      WHERE {
        VALUES ?celex { "${celexLiteral}" }
        ?work a cdm:resource_legal ;
          cdm:resource_legal_id_celex ?celex ;
          cdm:work_title ?title ;
          cdm:work_date_document ?date ;
          cdm:work_has_resource-type ?type .
        FILTER(LANG(?title) = "${titleLanguage}")
        BIND(REPLACE(STR(?type), "^.*/resource-type/", "") AS ?type_code)
        OPTIONAL {
          ?work owl:sameAs ?eli_candidate .
          FILTER(CONTAINS(STR(?eli_candidate), "/resource/eli/"))
        }
        OPTIONAL {
          ?work cmr:lastModificationDate ?modified_candidate .
        }
      }
      GROUP BY ?celex ?title ?date ?type_code
      LIMIT 1
    `.trim();
    const results = await this.fetchEuMetadataFromQuery(sparqlQuery, languageArg);
    if (!results[0]) {
      throw new Error(`No EUR-Lex metadata found for CELEX ${celexArg}`);
    }
    return results[0];
  }

  /**
   * Runs a SPARQL query against the Publications Office endpoint and maps the rows.
   * @throws Error on a non-2xx response.
   */
  private async fetchEuMetadataFromQuery(
    queryArg: string,
    languageArg: string
  ): Promise<IEuLawMetadata[]> {
    const searchParams = new URLSearchParams({
      query: queryArg,
      format: 'application/sparql-results+json',
    });
    const response = await fetch(
      `https://publications.europa.eu/webapi/rdf/sparql?${searchParams}`,
      {
        headers: {
          'Accept': 'application/sparql-results+json',
        },
      }
    );
    if (!response.ok) {
      throw new Error(`Failed to query EU metadata: ${response.status} ${response.statusText}`);
    }
    const responseData = (await response.json()) as ISparqlJsonResult;
    const bindings = responseData.results?.bindings ?? [];
    return bindings.map((bindingArg) => ({
      celex: bindingArg.celex?.value ?? '',
      title: bindingArg.title?.value ?? '',
      type: bindingArg.type_code?.value ?? '',
      dateIssued: bindingArg.date?.value ?? '',
      eli: bindingArg.eli?.value ?? '',
      lastModified: bindingArg.modified_at?.value ?? '',
      language: languageArg,
    }));
  }

  /** Escapes backslashes, double quotes and line breaks for use inside a double-quoted SPARQL literal. */
  private escapeSparqlString(valueArg: string) {
    return valueArg
      .replace(/[\\"]/g, '\\$&')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r');
  }

  /** Walks a GovInfo collection listing (default 'PLAW') and syncs each package until `limit` is reached. */
  private async syncUsLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const collection = requestArg.usCollection ?? 'PLAW';
    const apiKey = requestArg.govInfoApiKey ?? this.config.govInfoApiKey;
    const pageSize = 25;
    let remaining = requestArg.limit;
    const identifiers: string[] = [];
    let nextPageUrl = this.buildGovInfoCollectionUrl(
      collection,
      requestArg.since ?? new Date('1900-01-01T00:00:00.000Z'),
      pageSize,
      requestArg.offset ? String(requestArg.offset) : '*',
      apiKey
    );

    while (nextPageUrl && (remaining === undefined || remaining > 0)) {
      const collectionResponse = await this.fetchJson<IGovInfoCollectionResponse>(nextPageUrl);
      const packageEntries = collectionResponse.packages ?? [];
      if (packageEntries.length === 0) {
        break;
      }
      const currentBatch =
        remaining === undefined ? packageEntries : packageEntries.slice(0, remaining);
      for (const packageEntry of currentBatch) {
        const syncedLaw = await this.syncUsLaw(packageEntry.packageId, collection, apiKey);
        identifiers.push(syncedLaw.identifier);
      }
      if (remaining !== undefined) {
        remaining -= currentBatch.length;
      }
      nextPageUrl = collectionResponse.nextPage
        ? this.withGovInfoApiKey(collectionResponse.nextPage, apiKey)
        : '';
    }

    return {
      jurisdiction: 'us',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Syncs one U.S. law. U.S. Code citations are fetched from the Cornell LII page;
   * package ids are fetched from public GovInfo content first and from the GovInfo
   * API as a fallback.
   * @throws Error when the package summary offers no downloadable text.
   */
  private async syncUsLaw(
    packageIdArg: string,
    collectionArg?: TUsLawCollection,
    apiKeyArg?: string
  ): Promise<LawRecord> {
    const usCodeCitation = this.parseUsCodeCitation(packageIdArg);
    if (usCodeCitation) {
      return this.syncUsCodeCitation(usCodeCitation);
    }

    // Always defined: an explicit collection wins, otherwise it is derived from the id prefix.
    const collection: TUsLawCollection =
      collectionArg ?? (packageIdArg.startsWith('USCODE-') ? 'USCODE' : 'PLAW');
    const directContentLaw = await this.trySyncUsLawFromPublicContent(packageIdArg, collection);
    if (directContentLaw) {
      return directContentLaw;
    }

    const apiKey = apiKeyArg ?? this.config.govInfoApiKey;
    const summary = await this.fetchJson<IGovInfoSummary>(
      this.withGovInfoApiKey(
        `https://api.govinfo.gov/packages/${encodeURIComponent(packageIdArg)}/summary`,
        apiKey
      )
    );
    // Public laws prefer the USLM XML rendition, everything else prefers plain text.
    const rawUrl =
      collection === 'PLAW'
        ? (summary.download?.uslmLink ?? summary.download?.txtLink)
        : (summary.download?.txtLink ?? summary.download?.uslmLink);
    if (!rawUrl) {
      throw new Error(`No downloadable GovInfo text found for ${packageIdArg}`);
    }

    const rawFormat: TRawLawFormat = rawUrl.includes('/uslm') ? 'xml' : 'text';
    const rawBody = await this.fetchText(this.withGovInfoApiKey(rawUrl, apiKey));
    const citation =
      collection === 'USCODE'
        ? `Title ${summary.titleNumber ?? ''} U.S. Code`.trim()
        : `Public Law ${summary.congress ?? ''}-${summary.documentNumber ?? ''}`.trim();
    const source = collection === 'USCODE' ? 'govinfo-uscode' : 'govinfo-plaw';

    return this.upsertLaw({
      jurisdiction: 'us',
      source,
      identifier: packageIdArg,
      title: summary.title ?? packageIdArg,
      shortTitle: summary.shortTitle?.[0]?.title ?? '',
      citation,
      type: summary.documentType ?? collection,
      language: 'en',
      sourceUrl: summary.detailsLink ?? rawUrl,
      rawFormat,
      rawBody,
      text: rawFormat === 'xml' ? this.markupToText(rawBody) : this.decodeHtmlEntities(rawBody),
      dateIssued: summary.dateIssued,
      lastModified: summary.lastModified,
      sourceMeta: {
        packageId: packageIdArg,
        collection,
      },
    });
  }

  /** Fetches a U.S. Code section page from Cornell LII and stores its active tab content. */
  private async syncUsCodeCitation(citationArg: IUsCodeCitation): Promise<LawRecord> {
    const rawBody = await this.fetchText(citationArg.sourceUrl);
    const pageTitle = this.extractFirstMatch(
      rawBody,
      /<h1[^>]*id="page_title"[^>]*>([\s\S]*?)<\/h1>/i
    );
    // Fall back to the whole page when the expected tab markup is not found.
    const sectionMarkup =
      this.extractFirstMatch(
        rawBody,
        /<div class="tab-pane active" id="tab_default_1">([\s\S]*?)<\/div>\s*<div class="tab-pane" id="tab_default_2">/i
      ) || rawBody;
    const resolvedTitle = this.extractUsLiiHeading(pageTitle) || citationArg.canonicalCitation;

    return this.upsertLaw({
      jurisdiction: 'us',
      source: 'law-cornell-lii',
      identifier: citationArg.canonicalIdentifier,
      title: resolvedTitle,
      shortTitle: citationArg.canonicalCitation,
      citation: citationArg.canonicalCitation,
      type: 'USCODE_SECTION',
      language: 'en',
      sourceUrl: citationArg.sourceUrl,
      rawFormat: 'html',
      rawBody: sectionMarkup,
      text: this.markupToText(sectionMarkup),
      sourceMeta: {
        titleNumber: citationArg.titleNumber,
        sectionNumber: citationArg.sectionNumber,
        subsectionPath: citationArg.subsectionPath,
      },
    });
  }

  /**
   * Tries to download a package directly from www.govinfo.gov (no API key needed).
   * @returns the stored record, or `null` when the download fails.
   */
  private async trySyncUsLawFromPublicContent(
    packageIdArg: string,
    collectionArg: TUsLawCollection
  ): Promise<LawRecord | null> {
    const encodedPackageId = encodeURIComponent(packageIdArg);
    const rawUrl =
      collectionArg === 'USCODE'
        ? `https://www.govinfo.gov/content/pkg/${encodedPackageId}/html/${encodedPackageId}.htm`
        : `https://www.govinfo.gov/content/pkg/${encodedPackageId}/uslm/${encodedPackageId}.xml`;

    let rawBody: string;
    try {
      rawBody = await this.fetchText(rawUrl);
    } catch {
      // Only the download is best effort; persistence errors below must surface.
      return null;
    }

    const citationValues = this.extractTagValues(rawBody, 'citableAs');
    const title =
      this.extractTagValue(rawBody, 'dc:title')
        .replace(/^Public Law\s+[^:]+:\s*/i, '')
        .trim() || packageIdArg;
    const shortTitle = this.extractTagValue(rawBody, 'shortTitle');
    const dateIssued =
      this.extractTagValue(rawBody, 'approvedDate') || this.extractTagValue(rawBody, 'dc:date');
    const type = this.extractTagValue(rawBody, 'dc:type') || collectionArg;

    return this.upsertLaw({
      jurisdiction: 'us',
      source: collectionArg === 'USCODE' ? 'govinfo-uscode' : 'govinfo-plaw',
      identifier: packageIdArg,
      title,
      shortTitle,
      citation: citationValues[0] || packageIdArg,
      type,
      language: 'en',
      sourceUrl: rawUrl,
      rawFormat: rawUrl.endsWith('.xml') ? 'xml' : 'html',
      rawBody,
      text: this.markupToText(rawBody),
      dateIssued,
      sourceMeta: {
        packageId: packageIdArg,
        collection: collectionArg,
      },
    });
  }

  /**
   * Parses citations like "42 U.S.C. 1983", "42 USC 1983(a)(1)" or "42 USCODE 1983".
   * @returns `null` for empty input, PLAW- / USCODE- package ids and anything that does not match.
   */
  private parseUsCodeCitation(identifierArg: string): IUsCodeCitation | null {
    const cleanedIdentifier = identifierArg
      .trim()
      .replace(/\u00a0/g, ' ')
      .replace(/\s+/g, ' ')
      .replace(/§+/g, ' ');
    if (!cleanedIdentifier || /^(PLAW|USCODE)-/i.test(cleanedIdentifier)) {
      return null;
    }

    const citationMatch = cleanedIdentifier.match(
      /^(\d+[A-Za-z]*)\s*(?:U\.?\s*S\.?\s*C\.?|USC(?:ODE)?)\s+([0-9A-Za-z.-]+)((?:\([A-Za-z0-9]+\))*)$/i
    );
    if (!citationMatch) {
      return null;
    }

    const [, rawTitleNumber = '', sectionNumber = '', subsectionPath = ''] = citationMatch;
    const titleNumber = rawTitleNumber.toUpperCase();
    // The canonical form is section-level; the subsection path is kept separately.
    const canonicalCitation = `${titleNumber} USC ${sectionNumber}`;
    return {
      titleNumber,
      sectionNumber,
      subsectionPath,
      canonicalIdentifier: canonicalCitation,
      canonicalCitation,
      sourceUrl: `https://www.law.cornell.edu/uscode/text/${encodeURIComponent(titleNumber)}/${encodeURIComponent(sectionNumber)}`,
    };
  }

  /** Builds the first-page URL of a GovInfo collection listing, including the API key. */
  private buildGovInfoCollectionUrl(
    collectionArg: TUsLawCollection,
    sinceArg: Date,
    pageSizeArg: number,
    offsetMarkArg: string,
    apiKeyArg: string
  ) {
    // Drop the millisecond part of the ISO timestamp.
    const sinceIso = sinceArg.toISOString().replace(/\.\d{3}Z$/, 'Z');
    return this.withGovInfoApiKey(
      `https://api.govinfo.gov/collections/${collectionArg}/${sinceIso}?pageSize=${pageSizeArg}&offsetMark=${encodeURIComponent(offsetMarkArg)}`,
      apiKeyArg
    );
  }

  /** Returns the URL with its `api_key` query parameter set (replacing an existing one). */
  private withGovInfoApiKey(urlArg: string, apiKeyArg: string) {
    const url = new URL(urlArg);
    url.searchParams.set('api_key', apiKeyArg);
    return url.toString();
  }

  /**
   * GETs a URL and returns the response body as text.
   * @throws Error on a non-2xx response.
   */
  private async fetchText(urlArg: string) {
    const response = await fetch(urlArg, {
      headers: {
        'User-Agent': '@fin.cx/opendata laws sync',
      },
    });
    if (!response.ok) {
      throw new Error(`Failed to fetch ${urlArg}: ${response.status} ${response.statusText}`);
    }
    return response.text();
  }

  /**
   * GETs a URL and returns the parsed JSON body. The result is not validated against `TResponse`.
   * @throws Error on a non-2xx response.
   */
  private async fetchJson<TResponse>(urlArg: string): Promise<TResponse> {
    const response = await fetch(urlArg, {
      headers: {
        'User-Agent': '@fin.cx/opendata laws sync',
        'Accept': 'application/json',
      },
    });
    if (!response.ok) {
      throw new Error(`Failed to fetch ${urlArg}: ${response.status} ${response.statusText}`);
    }
    return response.json() as Promise<TResponse>;
  }

  /**
   * Builds a regex matching `<tag ...>content</tag>` with the content as capture 1.
   * The tag name is escaped and must be followed by whitespace or `>`, so that
   * e.g. "title" does not match a "titleNumber" element.
   */
  private buildTagRegex(tagNameArg: string, flagsArg: string) {
    const tagName = this.escapeRegex(tagNameArg);
    return new RegExp(`<${tagName}(?:\\s[^>]*)?>([\\s\\S]*?)</${tagName}\\s*>`, flagsArg);
  }

  /** Returns the entity-decoded, trimmed content of the first matching element, or ''. */
  private extractTagValue(xmlArg: string, tagNameArg: string) {
    const match = xmlArg.match(this.buildTagRegex(tagNameArg, 'i'));
    return match ? this.decodeHtmlEntities((match[1] ?? '').trim()) : '';
  }

  /** Returns the entity-decoded, trimmed content of every matching element. */
  private extractTagValues(xmlArg: string, tagNameArg: string) {
    const matches = Array.from(xmlArg.matchAll(this.buildTagRegex(tagNameArg, 'gi')));
    return matches.map((matchArg) => this.decodeHtmlEntities((matchArg[1] ?? '').trim()));
  }

  /** Returns the entity-decoded, trimmed first capture group of the regex, or ''. */
  private extractFirstMatch(textArg: string, regexArg: RegExp) {
    const match = textArg.match(regexArg);
    return match ? this.decodeHtmlEntities((match[1] ?? '').trim()) : '';
  }

  /** Strips a leading "<title> U.S. Code § <section> -" prefix from an LII page title when present. */
  private extractUsLiiHeading(pageTitleArg: string) {
    const normalizedTitle = pageTitleArg.replace(/\s+/g, ' ').trim();
    const headingMatch = normalizedTitle.match(/^\d+\s+U\.?S\.? Code\s*§\s*[^-]+-\s*(.+)$/i);
    return headingMatch ? (headingMatch[1] ?? '').trim() : normalizedTitle;
  }

  /** Converts XML / HTML markup to plain text: block ends become newlines, tags are removed, entities decoded. */
  private markupToText(markupArg: string) {
    return this.decodeHtmlEntities(
      markupArg
        .replace(/<\/?br\s*\/?>/gi, '\n')
        .replace(/<\/(p|div|section|article|li|tr|h[1-6])>/gi, '\n')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\r/g, '')
        .replace(/[ \t]+\n/g, '\n')
        .replace(/\n{3,}/g, '\n\n')
        .replace(/[ \t]{2,}/g, ' ')
        .trim()
    );
  }

  /**
   * Decodes the named entities amp, lt, gt, quot, apos, nbsp and numeric
   * (decimal / hexadecimal) character references in a single pass, so already
   * decoded output is never decoded again. References outside the Unicode range
   * are left untouched instead of throwing.
   */
  private decodeHtmlEntities(textArg: string) {
    // Keyed by bare entity name (without the surrounding ampersand / semicolon).
    const namedEntities: Record<string, string> = {
      amp: '&',
      lt: '<',
      gt: '>',
      quot: '"',
      apos: "'",
      nbsp: ' ',
    };
    return textArg.replace(
      /&(?:(amp|lt|gt|quot|apos|nbsp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
      (matchArg: string, nameArg?: string, decimalArg?: string, hexArg?: string) => {
        if (nameArg !== undefined) {
          return namedEntities[nameArg] ?? matchArg;
        }
        const codePoint =
          decimalArg !== undefined
            ? Number.parseInt(decimalArg, 10)
            : Number.parseInt(hexArg ?? '', 16);
        // String.fromCodePoint throws a RangeError above U+10FFFF.
        return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff
          ? String.fromCodePoint(codePoint)
          : matchArg;
      }
    );
  }

  /** Escapes regex metacharacters so the value matches literally. */
  private escapeRegex(valueArg: string) {
    return valueArg.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /** Returns the first trimmed line longer than 20 characters, or ''. */
  private firstMeaningfulLine(textArg: string) {
    return (
      textArg
        .split('\n')
        .map((lineArg) => lineArg.trim())
        .find((lineArg) => lineArg.length > 20) ?? ''
    );
  }
}