// Source: opendata/ts/laws/classes.lawservice.ts (TypeScript; original listing: 1011 lines, 30 KiB)

import * as plugins from '../plugins.js';
import * as paths from '../paths.js';
import { LawRecord } from './classes.lawrecord.js';
import type {
ILawLookupRequest,
ILawSearchRequest,
ILawServiceConfig,
ILawSyncRequest,
ILawSyncResult,
TJurisdiction,
TLawSource,
TRawLawFormat,
TUsLawCollection,
} from './interfaces.law.js';
/**
 * Normalized law payload handed to `LawService.upsertLaw`, which copies every
 * field onto a `LawRecord`. Optional string fields are stored as `''` and
 * `sourceMeta` as `{}` when omitted.
 */
interface IStoredLawData {
  jurisdiction: TJurisdiction;
  source: TLawSource;
  /** Combined with `jurisdiction` to derive the record's lookup key. */
  identifier: string;
  title: string;
  shortTitle?: string;
  citation?: string;
  type?: string;
  language?: string;
  sourceUrl: string;
  /** Format of `rawBody`. */
  rawFormat: TRawLawFormat;
  /** Body as downloaded from the source. */
  rawBody: string;
  /** Plain-text rendering stored next to `rawBody`. */
  text: string;
  dateIssued?: string;
  lastModified?: string;
  sourceMeta?: Record<string, string>;
}
/** One `<item>` of the gesetze-im-internet table of contents, as produced by `parseGermanyToc`. */
interface IGermanyLawTocEntry {
  /** Entity-decoded `<title>` text. */
  title: string;
  /** `<link>` value with `http://` rewritten to `https://`. */
  xmlZipUrl: string;
  /** `xmlZipUrl` reduced by `normalizeGermanySlug`. */
  slug: string;
}
/**
 * Metadata for one EU act. `fetchEuMetadataFromQuery` fills it from SPARQL
 * bindings (missing bindings become `''`); `syncEuLaw` builds it from the
 * rendered page when no metadata is supplied.
 */
interface IEuLawMetadata {
  celex: string;
  title: string;
  /** Taken from the `type_code` binding when built from SPARQL results. */
  type: string;
  dateIssued: string;
  eli: string;
  lastModified: string;
  language: string;
}
/** Subset of the SPARQL JSON response that `fetchEuMetadataFromQuery` reads. */
interface ISparqlJsonResult {
  results?: {
    /** One entry per result row, keyed by variable name. */
    bindings?: Record<string, { value: string }>[];
  };
}
/** Subset of a GovInfo collection listing that `syncUsLaws` reads. */
interface IGovInfoCollectionResponse {
  /** URL of the following page; paging stops when it is absent. */
  nextPage?: string;
  packages?: { packageId: string }[];
}
/** Subset of the GovInfo `/packages/{id}/summary` response that `syncUsLaw` reads. */
interface IGovInfoSummary {
  packageId: string;
  collectionCode: string;
  title?: string;
  /** Only the first entry's `title` is used. */
  shortTitle?: { title: string }[];
  detailsLink?: string;
  dateIssued?: string;
  lastModified?: string;
  /** Used for the citation of USCODE packages. */
  titleNumber?: string;
  /** Used, with `documentNumber`, for the citation of other packages. */
  congress?: string;
  documentNumber?: string;
  documentType?: string;
  /** Download links for the raw body. */
  download?: {
    txtLink?: string;
    uslmLink?: string;
  };
}
/** Content scraped from a rendered EUR-Lex page by `fetchEuLawPageContent`. */
interface IEuPageContent {
  /** `innerHTML` of the `#text` element, or of `<body>` when it is missing. */
  html: string;
  /** `innerText` of the same element. */
  text: string;
  /** All `#document1 p.oj-doc-ti` paragraphs joined with spaces. */
  title: string;
  /** Text of `#document1 p.oj-hd-date`, or `''`. */
  dateIssued: string;
  /** First title paragraph, or `''`. */
  type: string;
  /** `href` of the first link containing `data.europa.eu/eli`, or `''`. */
  eli: string;
}
/** Parsed U.S. Code citation as returned by `parseUsCodeCitation`. */
interface IUsCodeCitation {
  /** Upper-cased title token, e.g. `42`. */
  titleNumber: string;
  sectionNumber: string;
  /** Trailing parenthesised groups such as `(a)(1)`, or `''`. */
  subsectionPath: string;
  /** `"<title> USC <section>"`; used as the stored identifier. */
  canonicalIdentifier: string;
  /** Same value as `canonicalIdentifier`. */
  canonicalCitation: string;
  /** law.cornell.edu URL of the section. */
  sourceUrl: string;
}
/**
 * Downloads laws from public German (gesetze-im-internet), EU (EUR-Lex) and US
 * (GovInfo, Cornell LII) sources and stores them as `LawRecord` documents in a
 * local SmartDB instance.
 *
 * The public entry points (`syncLaw`, `getLaw`, `searchLaws`, `syncLaws`) start
 * the service on first use; call `stop()` to shut down the headless browser
 * and the database again.
 */
export class LawService {
  public db!: plugins.smartdata.SmartdataDb;
  public CLawRecord = plugins.smartdata.setDefaultManagerForDoc(this, LawRecord);
  private localSmartDb?: plugins.smartdb.LocalSmartDb;
  private smartBrowser?: plugins.smartbrowser.SmartBrowser;
  private browserStarted = false;
  private started = false;
  private config: Required<Pick<ILawServiceConfig, 'dbFolderPath' | 'dbName' | 'govInfoApiKey'>>;

  /**
   * @param configArg Optional overrides. Defaults: database folder
   *   `<packageDir>/.nogit/law-smartdb`, database name `laws`, GovInfo API key
   *   `DEMO_KEY`.
   */
  constructor(configArg: ILawServiceConfig = {}) {
    this.config = {
      dbFolderPath:
        configArg.dbFolderPath ??
        plugins.path.join(paths.packageDir, '.nogit', 'law-smartdb'),
      dbName: configArg.dbName ?? 'laws',
      govInfoApiKey: configArg.govInfoApiKey ?? 'DEMO_KEY',
    };
  }

  /**
   * Creates the database folder, starts the local SmartDB and connects
   * SmartData to it. Returns immediately when already started.
   */
  public async start() {
    if (this.started) {
      return;
    }
    await plugins.smartfs.directory(this.config.dbFolderPath).create();
    this.localSmartDb = new plugins.smartdb.LocalSmartDb({
      folderPath: this.config.dbFolderPath,
    });
    const connectionInfo = await this.localSmartDb.start();
    this.db = new plugins.smartdata.SmartdataDb({
      mongoDbUrl: connectionInfo.connectionUri,
      mongoDbName: this.config.dbName,
    });
    await this.db.init();
    // Write and immediately remove a marker document.
    // NOTE(review): presumably this forces the database to exist before first use - confirm.
    const bootstrapCollection = this.db.mongoDb.collection('_opendata_bootstrap');
    await bootstrapCollection.insertOne({
      createdAt: new Date(),
    });
    await bootstrapCollection.deleteMany({});
    this.started = true;
  }

  /**
   * Stops the headless browser (if it was started) and closes the database.
   * The database is closed even when stopping the browser throws; the browser
   * error is still propagated to the caller.
   */
  public async stop() {
    try {
      if (this.browserStarted && this.smartBrowser) {
        await this.smartBrowser.stop();
        this.browserStarted = false;
      }
    } finally {
      if (this.started) {
        await this.db.close();
        await this.localSmartDb?.stop();
        this.started = false;
      }
    }
  }

  /**
   * Downloads one law from its source and stores it, replacing any stored copy.
   *
   * @throws Error when the jurisdiction is not `de`, `eu` or `us`.
   */
  public async syncLaw(requestArg: ILawLookupRequest): Promise<LawRecord> {
    await this.ensureStarted();
    switch (requestArg.jurisdiction) {
      case 'de':
        return this.syncGermanyLaw(requestArg.identifier);
      case 'eu':
        return this.syncEuLaw(requestArg.identifier, requestArg.language ?? 'EN');
      case 'us':
        return this.syncUsLaw(
          requestArg.identifier,
          requestArg.usCollection,
          this.config.govInfoApiKey
        );
      default:
        throw new Error(`Unsupported jurisdiction: ${requestArg.jurisdiction}`);
    }
  }

  /**
   * Returns the stored law for the request, downloading it when it is not
   * stored yet or when `forceSync` is set.
   */
  public async getLaw(requestArg: ILawLookupRequest): Promise<LawRecord | null> {
    await this.ensureStarted();
    const lookupKey = this.createLookupKey(requestArg.jurisdiction, requestArg.identifier);
    const existingLaw = await this.getLawByLookupKey(lookupKey);
    if (existingLaw && !requestArg.forceSync) {
      return existingLaw;
    }
    return this.syncLaw(requestArg);
  }

  /**
   * Searches stored laws. Tries `CLawRecord.search` first; when that throws,
   * falls back to a case-insensitive literal match against identifier, title,
   * short title, citation and text.
   *
   * @returns At most `limit` records (default 20).
   */
  public async searchLaws(requestArg: ILawSearchRequest): Promise<LawRecord[]> {
    await this.ensureStarted();
    const limit = requestArg.limit ?? 20;
    const baseFilter = requestArg.jurisdiction
      ? ({ jurisdiction: requestArg.jurisdiction } as Record<string, unknown>)
      : undefined;
    try {
      const results = await this.CLawRecord.search(
        requestArg.query,
        baseFilter ? { filter: baseFilter } : undefined
      );
      return results.slice(0, limit);
    } catch {
      // Deliberate best-effort fallback: any failure of the primary search
      // leads to the regex query below.
      const regex = new RegExp(this.escapeRegex(requestArg.query), 'i');
      const searchFilter = {
        $or: [
          { identifier: { $regex: regex } },
          { title: { $regex: regex } },
          { shortTitle: { $regex: regex } },
          { citation: { $regex: regex } },
          { text: { $regex: regex } },
        ],
      };
      const mongoFilter = baseFilter
        ? {
            $and: [baseFilter, searchFilter],
          }
        : searchFilter;
      const results = await this.CLawRecord.getInstances(mongoFilter as any);
      return results.slice(0, limit);
    }
  }

  /**
   * Bulk-syncs laws of one jurisdiction.
   *
   * @throws Error when the jurisdiction is not `de`, `eu` or `us`.
   */
  public async syncLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    await this.ensureStarted();
    switch (requestArg.jurisdiction) {
      case 'de':
        return this.syncGermanyLaws(requestArg);
      case 'eu':
        return this.syncEuLaws(requestArg);
      case 'us':
        return this.syncUsLaws(requestArg);
      default:
        throw new Error(`Unsupported jurisdiction: ${requestArg.jurisdiction}`);
    }
  }

  /** Starts the service when it has not been started yet. */
  private async ensureStarted() {
    if (!this.started) {
      await this.start();
    }
  }

  /** Creates the SmartBrowser on first use, starts it if needed and returns it. */
  private async ensureBrowser() {
    if (!this.smartBrowser) {
      this.smartBrowser = new plugins.smartbrowser.SmartBrowser();
    }
    if (!this.browserStarted) {
      await this.smartBrowser.start();
      this.browserStarted = true;
    }
    return this.smartBrowser;
  }

  /** Builds the `<jurisdiction>:<normalized identifier>` key used to find stored laws. */
  private createLookupKey(jurisdictionArg: TJurisdiction, identifierArg: string) {
    return `${jurisdictionArg}:${this.normalizeIdentifierForLookup(jurisdictionArg, identifierArg)}`;
  }

  /**
   * Normalizes an identifier per jurisdiction: German identifiers become
   * slugs, EU identifiers become bare upper-case CELEX numbers, US identifiers
   * become the canonical U.S. Code citation when they parse as one and are
   * otherwise trimmed and upper-cased.
   */
  private normalizeIdentifierForLookup(jurisdictionArg: TJurisdiction, identifierArg: string) {
    switch (jurisdictionArg) {
      case 'de':
        return this.normalizeGermanySlug(identifierArg);
      case 'eu':
        return this.normalizeCelex(identifierArg);
      case 'us': {
        const usCodeCitation = this.parseUsCodeCitation(identifierArg);
        return usCodeCitation
          ? usCodeCitation.canonicalIdentifier
          : identifierArg.trim().toUpperCase();
      }
      default:
        return identifierArg.trim();
    }
  }

  /** Trims a CELEX identifier, drops a leading `CELEX:` prefix and upper-cases it. */
  private normalizeCelex(celexArg: string) {
    return celexArg.trim().replace(/^CELEX:/i, '').toUpperCase();
  }

  /** Looks up a stored law by its lookup key via `LawRecord.getByLookupKey`. */
  private async getLawByLookupKey(lookupKeyArg: string) {
    return LawRecord.getByLookupKey(lookupKeyArg);
  }

  /**
   * Creates or updates the record for `dataArg` and saves it. A new record
   * gets a fresh id and the lookup key; all other fields are overwritten on
   * every call, with missing optional values stored as `''` / `{}`.
   */
  private async upsertLaw(dataArg: IStoredLawData): Promise<LawRecord> {
    const lookupKey = this.createLookupKey(dataArg.jurisdiction, dataArg.identifier);
    let lawRecord = await this.getLawByLookupKey(lookupKey);
    if (!lawRecord) {
      lawRecord = new this.CLawRecord();
      lawRecord.id = await this.CLawRecord.getNewId();
      lawRecord.lookupKey = lookupKey;
    }
    lawRecord.jurisdiction = dataArg.jurisdiction;
    lawRecord.source = dataArg.source;
    lawRecord.identifier = dataArg.identifier;
    lawRecord.title = dataArg.title;
    lawRecord.shortTitle = dataArg.shortTitle ?? '';
    lawRecord.citation = dataArg.citation ?? '';
    lawRecord.type = dataArg.type ?? '';
    lawRecord.language = dataArg.language ?? '';
    lawRecord.sourceUrl = dataArg.sourceUrl;
    lawRecord.rawFormat = dataArg.rawFormat;
    lawRecord.rawBody = dataArg.rawBody;
    lawRecord.text = dataArg.text;
    lawRecord.dateIssued = dataArg.dateIssued ?? '';
    lawRecord.lastModified = dataArg.lastModified ?? '';
    lawRecord.sourceMeta = dataArg.sourceMeta ?? {};
    lawRecord.fetchedAt = new Date();
    lawRecord.syncedAt = new Date();
    await lawRecord.save();
    return lawRecord;
  }

  /**
   * Syncs German laws listed in the gesetze-im-internet table of contents,
   * one after another, honouring `offset` and `limit`.
   */
  private async syncGermanyLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const tocXml = await this.fetchText('https://www.gesetze-im-internet.de/gii-toc.xml');
    const tocEntries = this.parseGermanyToc(tocXml);
    const offset = requestArg.offset ?? 0;
    const targetEntries = typeof requestArg.limit === 'number'
      ? tocEntries.slice(offset, offset + requestArg.limit)
      : tocEntries.slice(offset);
    const identifiers: string[] = [];
    for (const entry of targetEntries) {
      const syncedLaw = await this.syncGermanyLaw(entry);
      identifiers.push(syncedLaw.identifier);
    }
    return {
      jurisdiction: 'de',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Downloads the XML zip of one German law and stores it.
   *
   * @param identifierOrEntryArg Slug/URL of the law, or a table-of-contents entry.
   * @throws Error when the archive contains no file at all.
   */
  private async syncGermanyLaw(identifierOrEntryArg: string | IGermanyLawTocEntry): Promise<LawRecord> {
    let germanyEntry: IGermanyLawTocEntry;
    if (typeof identifierOrEntryArg === 'string') {
      const slug = this.normalizeGermanySlug(identifierOrEntryArg);
      germanyEntry = {
        slug,
        title: '',
        xmlZipUrl: `https://www.gesetze-im-internet.de/${slug}/xml.zip`,
      };
    } else {
      germanyEntry = identifierOrEntryArg;
    }
    const extractedFiles = await plugins.smartarchive.SmartArchive.create()
      .url(germanyEntry.xmlZipUrl)
      .toSmartFiles();
    // Prefer the first .xml file; otherwise fall back to the first extracted file.
    const xmlFile =
      extractedFiles.find((fileArg: plugins.smartfile.SmartFile) =>
        fileArg.relative.toLowerCase().endsWith('.xml')
      ) ?? extractedFiles[0];
    if (!xmlFile) {
      throw new Error(`No XML file found for German law ${germanyEntry.slug}`);
    }
    const xmlBody = xmlFile.parseContentAsString('utf8');
    const title = this.extractTagValue(xmlBody, 'langue') || germanyEntry.title || germanyEntry.slug;
    const citation = this.extractTagValue(xmlBody, 'jurabk') || germanyEntry.slug.toUpperCase();
    const dateIssued = this.extractTagValue(xmlBody, 'ausfertigung-datum');
    return this.upsertLaw({
      jurisdiction: 'de',
      source: 'gesetze-im-internet',
      identifier: germanyEntry.slug,
      title,
      shortTitle: citation,
      citation,
      type: 'law',
      language: 'de',
      sourceUrl: germanyEntry.xmlZipUrl,
      rawFormat: 'xml',
      rawBody: xmlBody,
      text: this.markupToText(xmlBody),
      dateIssued,
      sourceMeta: {
        slug: germanyEntry.slug,
      },
    });
  }

  /** Extracts every `<item><title>..</title><link>..</link></item>` entry from the table of contents. */
  private parseGermanyToc(tocXmlArg: string): IGermanyLawTocEntry[] {
    const items: IGermanyLawTocEntry[] = [];
    const itemRegex = /<item>\s*<title>([\s\S]*?)<\/title>\s*<link>([\s\S]*?)<\/link>\s*<\/item>/g;
    for (const match of tocXmlArg.matchAll(itemRegex)) {
      const title = this.decodeHtmlEntities(match[1].trim());
      const xmlZipUrl = match[2].trim().replace('http://', 'https://');
      const slug = this.normalizeGermanySlug(xmlZipUrl);
      items.push({
        title,
        xmlZipUrl,
        slug,
      });
    }
    return items;
  }

  /**
   * Reduces a gesetze-im-internet URL or path to its slug by removing the
   * site prefix, a trailing `/xml.zip` or `/index.html`, and surrounding
   * slashes.
   */
  private normalizeGermanySlug(identifierArg: string) {
    return identifierArg
      .trim()
      .replace(/^https?:\/\/www\.gesetze-im-internet\.de\//, '')
      .replace(/\/xml\.zip$/i, '')
      .replace(/\/index\.html$/i, '')
      .replace(/^\/+/, '')
      .replace(/\/+$/, '');
  }

  /**
   * Syncs EU acts page by page (25 per metadata query) until `limit` acts are
   * processed or the metadata query returns no more rows.
   */
  private async syncEuLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const language = (requestArg.language ?? 'EN').toUpperCase();
    const offset = requestArg.offset ?? 0;
    const identifiers: string[] = [];
    const pageSize = 25;
    let currentOffset = offset;
    let remaining = requestArg.limit;
    while (remaining === undefined || remaining > 0) {
      const currentPageSize = remaining === undefined ? pageSize : Math.min(pageSize, remaining);
      const metadataPage = await this.fetchEuMetadataPage(currentPageSize, currentOffset, language);
      if (metadataPage.length === 0) {
        break;
      }
      for (const metadata of metadataPage) {
        const syncedLaw = await this.syncEuLaw(metadata.celex, language, metadata);
        identifiers.push(syncedLaw.identifier);
      }
      currentOffset += metadataPage.length;
      if (remaining !== undefined) {
        remaining -= metadataPage.length;
      }
    }
    return {
      jurisdiction: 'eu',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Loads one EU act from EUR-Lex and stores it.
   *
   * @param celexArg CELEX number; a leading `CELEX:` prefix, surrounding
   *   whitespace and lower-case letters are tolerated.
   * @param languageArg EUR-Lex language code, upper-cased for the URL.
   * @param metadataArg Metadata already known for the act; when omitted it is
   *   derived from the rendered page.
   */
  private async syncEuLaw(
    celexArg: string,
    languageArg = 'EN',
    metadataArg?: IEuLawMetadata
  ): Promise<LawRecord> {
    const language = languageArg.toUpperCase();
    // Normalize exactly like the lookup key so that "CELEX:..." input neither
    // yields a "CELEX:CELEX:..." URL nor an identifier that disagrees with the key.
    const celex = this.normalizeCelex(celexArg);
    const pageUrl = `https://eur-lex.europa.eu/legal-content/${language}/TXT/?uri=CELEX:${encodeURIComponent(celex)}`;
    const euPageContent = await this.fetchEuLawPageContent(pageUrl);
    const metadata = metadataArg ?? {
      celex,
      title: euPageContent.title,
      type: euPageContent.type,
      dateIssued: euPageContent.dateIssued,
      eli: euPageContent.eli,
      lastModified: '',
      language,
    };
    return this.upsertLaw({
      jurisdiction: 'eu',
      source: 'eur-lex',
      identifier: metadata.celex,
      title: metadata.title || euPageContent.title || this.firstMeaningfulLine(euPageContent.text) || metadata.celex,
      citation: metadata.eli || euPageContent.eli || metadata.celex,
      type: metadata.type || euPageContent.type,
      language: metadata.language.toLowerCase(),
      sourceUrl: pageUrl,
      rawFormat: 'html',
      rawBody: euPageContent.html,
      text: euPageContent.text,
      dateIssued: metadata.dateIssued || euPageContent.dateIssued,
      lastModified: metadata.lastModified,
      sourceMeta: {
        celex: metadata.celex,
        eli: metadata.eli || euPageContent.eli,
      },
    });
  }

  /**
   * Opens a EUR-Lex page in the headless browser, waits for `#text` and
   * extracts HTML, text, title, date, type and ELI. The page is always closed.
   *
   * @throws Error when the extracted text is empty.
   */
  private async fetchEuLawPageContent(pageUrlArg: string): Promise<IEuPageContent> {
    const smartBrowser = await this.ensureBrowser();
    const page = await smartBrowser.headlessBrowser.newPage();
    try {
      await page.goto(pageUrlArg, {
        waitUntil: 'domcontentloaded',
        timeout: 60000,
      });
      await page.waitForSelector('#text', {
        timeout: 60000,
      });
      // This callback is handed to the page and works on `document`; it must
      // not reference anything from the surrounding scope.
      const pageContent = await page.evaluate(() => {
        // Fall back to the whole body when #text is not present.
        const contentNode = (document.querySelector('#text') as HTMLElement | null) ?? document.body;
        const titleParts = Array.from(document.querySelectorAll('#document1 p.oj-doc-ti'))
          .map((elementArg) => elementArg.textContent?.trim() || '')
          .filter((itemArg) => itemArg.length > 0);
        const dateIssued =
          (document.querySelector('#document1 p.oj-hd-date') as HTMLElement | null)?.innerText?.trim() ||
          '';
        const eli =
          (document.querySelector('a[href*="data.europa.eu/eli"]') as HTMLAnchorElement | null)?.href ||
          '';
        return {
          html: contentNode.innerHTML,
          text: contentNode.innerText,
          title: titleParts.join(' '),
          dateIssued,
          type: titleParts[0] || '',
          eli,
        };
      });
      if (!pageContent.text.trim()) {
        throw new Error(`EUR-Lex returned empty text for ${pageUrlArg}`);
      }
      return pageContent;
    } finally {
      await page.close().catch(() => {});
    }
  }

  /**
   * Queries one page of metadata for regulations, directives and decisions,
   * ordered by document date and CELEX number, both descending.
   */
  private async fetchEuMetadataPage(
    limitArg: number,
    offsetArg: number,
    languageArg: string
  ): Promise<IEuLawMetadata[]> {
    const sparqlQuery = this.buildEuMetadataQuery(
      [
        'VALUES ?type {',
        'resource-type:REG',
        'resource-type:DIR',
        'resource-type:DEC',
        '}',
      ],
      languageArg,
      [
        'ORDER BY DESC(?date) DESC(?celex)',
        `LIMIT ${limitArg}`,
        `OFFSET ${offsetArg}`,
      ]
    );
    return this.fetchEuMetadataFromQuery(sparqlQuery, languageArg);
  }

  /**
   * Queries the metadata of a single act.
   *
   * @throws Error when the query returns no row.
   */
  private async fetchEuMetadataByCelex(
    celexArg: string,
    languageArg: string
  ): Promise<IEuLawMetadata> {
    const sparqlQuery = this.buildEuMetadataQuery(
      [`VALUES ?celex { "${this.escapeSparqlLiteral(celexArg)}" }`],
      languageArg,
      ['LIMIT 1']
    );
    const results = await this.fetchEuMetadataFromQuery(sparqlQuery, languageArg);
    if (!results[0]) {
      throw new Error(`No EUR-Lex metadata found for CELEX ${celexArg}`);
    }
    return results[0];
  }

  /**
   * Assembles the metadata SPARQL query shared by the page and single-act
   * lookups.
   *
   * @param valuesLinesArg Lines of the `VALUES` clause placed first in `WHERE`.
   * @param languageArg Title language; lower-cased and escaped before use.
   * @param tailLinesArg Lines appended after `GROUP BY` (ordering, limit, offset).
   */
  private buildEuMetadataQuery(
    valuesLinesArg: string[],
    languageArg: string,
    tailLinesArg: string[]
  ) {
    const titleLanguage = this.escapeSparqlLiteral(languageArg.toLowerCase());
    return [
      'PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>',
      'PREFIX resource-type: <http://publications.europa.eu/resource/authority/resource-type/>',
      'PREFIX owl: <http://www.w3.org/2002/07/owl#>',
      'PREFIX cmr: <http://publications.europa.eu/ontology/cdm/cmr#>',
      'SELECT',
      '?celex',
      '?title',
      '?date',
      '?type_code',
      '(SAMPLE(?eli_candidate) AS ?eli)',
      '(MAX(?modified_candidate) AS ?modified_at)',
      'WHERE {',
      ...valuesLinesArg,
      '?work a cdm:resource_legal ;',
      'cdm:resource_legal_id_celex ?celex ;',
      'cdm:work_title ?title ;',
      'cdm:work_date_document ?date ;',
      'cdm:work_has_resource-type ?type .',
      `FILTER(LANG(?title) = "${titleLanguage}")`,
      'BIND(REPLACE(STR(?type), "^.*/resource-type/", "") AS ?type_code)',
      'OPTIONAL {',
      '?work owl:sameAs ?eli_candidate .',
      'FILTER(CONTAINS(STR(?eli_candidate), "/resource/eli/"))',
      '}',
      'OPTIONAL {',
      '?work cmr:lastModificationDate ?modified_candidate .',
      '}',
      '}',
      'GROUP BY ?celex ?title ?date ?type_code',
      ...tailLinesArg,
    ].join('\n');
  }

  /**
   * Escapes backslashes, double quotes and line breaks so that a value can be
   * embedded in a double-quoted SPARQL string literal without ending it.
   */
  private escapeSparqlLiteral(valueArg: string) {
    return valueArg
      .replace(/\\/g, '\\\\')
      .replace(/"/g, '\\"')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r');
  }

  /**
   * Sends a SPARQL query to the Publications Office endpoint and maps each
   * result row to `IEuLawMetadata`; missing bindings become `''`.
   *
   * @throws Error when the endpoint answers with a non-OK status.
   */
  private async fetchEuMetadataFromQuery(
    queryArg: string,
    languageArg: string
  ): Promise<IEuLawMetadata[]> {
    const searchParams = new URLSearchParams({
      query: queryArg,
      format: 'application/sparql-results+json',
    });
    const response = await fetch(`https://publications.europa.eu/webapi/rdf/sparql?${searchParams}`, {
      headers: {
        'Accept': 'application/sparql-results+json',
      },
    });
    if (!response.ok) {
      throw new Error(`Failed to query EU metadata: ${response.status} ${response.statusText}`);
    }
    const responseData = await response.json() as ISparqlJsonResult;
    const bindings = responseData.results?.bindings ?? [];
    return bindings.map((bindingArg) => ({
      celex: bindingArg.celex?.value ?? '',
      title: bindingArg.title?.value ?? '',
      type: bindingArg.type_code?.value ?? '',
      dateIssued: bindingArg.date?.value ?? '',
      eli: bindingArg.eli?.value ?? '',
      lastModified: bindingArg.modified_at?.value ?? '',
      language: languageArg,
    }));
  }

  /**
   * Syncs packages of a GovInfo collection (default `PLAW`) modified since
   * `since` (default 1900-01-01), following `nextPage` links until `limit`
   * packages are processed or no page is left.
   */
  private async syncUsLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const collection = requestArg.usCollection ?? 'PLAW';
    const apiKey = requestArg.govInfoApiKey ?? this.config.govInfoApiKey;
    const pageSize = 25;
    let remaining = requestArg.limit;
    const identifiers: string[] = [];
    // NOTE(review): a numeric `offset` is forwarded as GovInfo's `offsetMark`
    // parameter - confirm the API accepts a number there.
    let nextPageUrl = this.buildGovInfoCollectionUrl(
      collection,
      requestArg.since ?? new Date('1900-01-01T00:00:00.000Z'),
      pageSize,
      requestArg.offset ? String(requestArg.offset) : '*',
      apiKey
    );
    while (nextPageUrl && (remaining === undefined || remaining > 0)) {
      const collectionResponse = await this.fetchJson<IGovInfoCollectionResponse>(nextPageUrl);
      const packageEntries = collectionResponse.packages ?? [];
      if (packageEntries.length === 0) {
        break;
      }
      const currentBatch = remaining === undefined
        ? packageEntries
        : packageEntries.slice(0, remaining);
      for (const packageEntry of currentBatch) {
        const syncedLaw = await this.syncUsLaw(packageEntry.packageId, collection, apiKey);
        identifiers.push(syncedLaw.identifier);
      }
      if (remaining !== undefined) {
        remaining -= currentBatch.length;
      }
      nextPageUrl = collectionResponse.nextPage
        ? this.withGovInfoApiKey(collectionResponse.nextPage, apiKey)
        : '';
    }
    return {
      jurisdiction: 'us',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Syncs one US law. U.S. Code citations (e.g. `42 U.S.C. 1983`) are loaded
   * from Cornell LII; GovInfo package ids are first tried via the public
   * content URL and then via the GovInfo API.
   *
   * @param collectionArg Collection of the package; when omitted it is
   *   `USCODE` for ids starting with `USCODE-` and `PLAW` otherwise.
   * @throws Error when the API summary offers no download link.
   */
  private async syncUsLaw(
    packageIdArg: string,
    collectionArg?: TUsLawCollection,
    apiKeyArg?: string
  ): Promise<LawRecord> {
    const usCodeCitation = this.parseUsCodeCitation(packageIdArg);
    if (usCodeCitation) {
      return this.syncUsCodeCitation(usCodeCitation);
    }
    const collection = collectionArg ?? (packageIdArg.startsWith('USCODE-') ? 'USCODE' : 'PLAW');
    const directContentLaw = await this.trySyncUsLawFromPublicContent(packageIdArg, collection);
    if (directContentLaw) {
      return directContentLaw;
    }
    const apiKey = apiKeyArg ?? this.config.govInfoApiKey;
    const summary = await this.fetchJson<IGovInfoSummary>(
      this.withGovInfoApiKey(
        `https://api.govinfo.gov/packages/${encodeURIComponent(packageIdArg)}/summary`,
        apiKey
      )
    );
    // PLAW prefers the USLM XML download, every other collection the text download.
    const rawUrl = collection === 'PLAW'
      ? summary.download?.uslmLink ?? summary.download?.txtLink
      : summary.download?.txtLink ?? summary.download?.uslmLink;
    if (!rawUrl) {
      throw new Error(`No downloadable GovInfo text found for ${packageIdArg}`);
    }
    const rawFormat: TRawLawFormat = rawUrl.includes('/uslm') ? 'xml' : 'text';
    const rawBody = await this.fetchText(this.withGovInfoApiKey(rawUrl, apiKey));
    const citation = collection === 'USCODE'
      ? `Title ${summary.titleNumber ?? ''} U.S. Code`.trim()
      : `Public Law ${summary.congress ?? ''}-${summary.documentNumber ?? ''}`.trim();
    const source = collection === 'USCODE' ? 'govinfo-uscode' : 'govinfo-plaw';
    return this.upsertLaw({
      jurisdiction: 'us',
      source,
      identifier: packageIdArg,
      title: summary.title ?? packageIdArg,
      shortTitle: summary.shortTitle?.[0]?.title ?? '',
      citation,
      type: summary.documentType ?? collection,
      language: 'en',
      sourceUrl: summary.detailsLink ?? rawUrl,
      rawFormat,
      rawBody,
      text: rawFormat === 'xml' ? this.markupToText(rawBody) : this.decodeHtmlEntities(rawBody),
      dateIssued: summary.dateIssued,
      lastModified: summary.lastModified,
      sourceMeta: {
        packageId: packageIdArg,
        collection,
      },
    });
  }

  /** Loads one U.S. Code section from Cornell LII and stores the section markup. */
  private async syncUsCodeCitation(citationArg: IUsCodeCitation): Promise<LawRecord> {
    const rawBody = await this.fetchText(citationArg.sourceUrl);
    const pageTitle = this.extractFirstMatch(
      rawBody,
      /<h1[^>]*id="page_title"[^>]*>([\s\S]*?)<\/h1>/i
    );
    // Keep only the first tab pane; fall back to the whole page when the
    // expected markup is not found.
    const sectionMarkup =
      this.extractFirstMatch(
        rawBody,
        /<div class="tab-pane active" id="tab_default_1">([\s\S]*?)<\/div>\s*<div class="tab-pane" id="tab_default_2">/i
      ) || rawBody;
    const resolvedTitle = this.extractUsLiiHeading(pageTitle) || citationArg.canonicalCitation;
    return this.upsertLaw({
      jurisdiction: 'us',
      source: 'law-cornell-lii',
      identifier: citationArg.canonicalIdentifier,
      title: resolvedTitle,
      shortTitle: citationArg.canonicalCitation,
      citation: citationArg.canonicalCitation,
      type: 'USCODE_SECTION',
      language: 'en',
      sourceUrl: citationArg.sourceUrl,
      rawFormat: 'html',
      rawBody: sectionMarkup,
      text: this.markupToText(sectionMarkup),
      sourceMeta: {
        titleNumber: citationArg.titleNumber,
        sectionNumber: citationArg.sectionNumber,
        subsectionPath: citationArg.subsectionPath,
      },
    });
  }

  /**
   * Tries to sync a GovInfo package straight from www.govinfo.gov (HTML for
   * `USCODE`, USLM XML otherwise) without using the API.
   *
   * @returns The stored record, or `null` when anything in the attempt fails
   *   (download, parsing or storing), so the caller can fall back to the API.
   */
  private async trySyncUsLawFromPublicContent(
    packageIdArg: string,
    collectionArg: TUsLawCollection
  ): Promise<LawRecord | null> {
    const rawUrl = collectionArg === 'USCODE'
      ? `https://www.govinfo.gov/content/pkg/${packageIdArg}/html/${packageIdArg}.htm`
      : `https://www.govinfo.gov/content/pkg/${packageIdArg}/uslm/${packageIdArg}.xml`;
    try {
      const rawBody = await this.fetchText(rawUrl);
      const citationValues = this.extractTagValues(rawBody, 'citableAs');
      const title =
        this.extractTagValue(rawBody, 'dc:title')
          .replace(/^Public Law\s+[^:]+:\s*/i, '')
          .trim() || packageIdArg;
      const shortTitle = this.extractTagValue(rawBody, 'shortTitle');
      const dateIssued =
        this.extractTagValue(rawBody, 'approvedDate') || this.extractTagValue(rawBody, 'dc:date');
      const type = this.extractTagValue(rawBody, 'dc:type') || collectionArg;
      return await this.upsertLaw({
        jurisdiction: 'us',
        source: collectionArg === 'USCODE' ? 'govinfo-uscode' : 'govinfo-plaw',
        identifier: packageIdArg,
        title,
        shortTitle,
        citation: citationValues[0] || packageIdArg,
        type,
        language: 'en',
        sourceUrl: rawUrl,
        rawFormat: rawUrl.endsWith('.xml') ? 'xml' : 'html',
        rawBody,
        text: this.markupToText(rawBody),
        dateIssued,
        sourceMeta: {
          packageId: packageIdArg,
          collection: collectionArg,
        },
      });
    } catch {
      // Best-effort path: every failure is reported as "not available".
      return null;
    }
  }

  /**
   * Parses citations such as `42 U.S.C. § 1983(a)(1)`, `42 USC 1983` or
   * `42 USCODE 1983`.
   *
   * @returns The parsed citation, or `null` for empty input, GovInfo package
   *   ids (`PLAW-...` / `USCODE-...`) and anything that is not a citation.
   */
  private parseUsCodeCitation(identifierArg: string): IUsCodeCitation | null {
    const cleanedIdentifier = identifierArg
      .trim()
      .replace(/\u00a0/g, ' ')
      .replace(/\s+/g, ' ')
      .replace(/§+/g, ' ');
    if (!cleanedIdentifier || /^(PLAW|USCODE)-/i.test(cleanedIdentifier)) {
      return null;
    }
    const citationMatch = cleanedIdentifier.match(
      /^(\d+[A-Za-z]*)\s*(?:U\.?\s*S\.?\s*C\.?|USC(?:ODE)?)\s+([0-9A-Za-z.-]+)((?:\([A-Za-z0-9]+\))*)$/i
    );
    if (!citationMatch) {
      return null;
    }
    const titleNumber = citationMatch[1].toUpperCase();
    const sectionNumber = citationMatch[2];
    const subsectionPath = citationMatch[3] ?? '';
    // The subsection path is kept separately; citation and URL address the whole section.
    const canonicalCitation = `${titleNumber} USC ${sectionNumber}`;
    return {
      titleNumber,
      sectionNumber,
      subsectionPath,
      canonicalIdentifier: canonicalCitation,
      canonicalCitation,
      sourceUrl: `https://www.law.cornell.edu/uscode/text/${encodeURIComponent(titleNumber)}/${encodeURIComponent(sectionNumber)}`,
    };
  }

  /** Builds the first-page URL of a GovInfo collection listing, including the API key. */
  private buildGovInfoCollectionUrl(
    collectionArg: TUsLawCollection,
    sinceArg: Date,
    pageSizeArg: number,
    offsetMarkArg: string,
    apiKeyArg: string
  ) {
    // Drop the milliseconds from the ISO timestamp placed in the URL path.
    const sinceIso = sinceArg.toISOString().replace(/\.\d{3}Z$/, 'Z');
    return this.withGovInfoApiKey(
      `https://api.govinfo.gov/collections/${collectionArg}/${sinceIso}?pageSize=${pageSizeArg}&offsetMark=${encodeURIComponent(offsetMarkArg)}`,
      apiKeyArg
    );
  }

  /** Returns `urlArg` with its `api_key` query parameter set to `apiKeyArg`. */
  private withGovInfoApiKey(urlArg: string, apiKeyArg: string) {
    const url = new URL(urlArg);
    url.searchParams.set('api_key', apiKeyArg);
    return url.toString();
  }

  /** Replaces the value of an `api_key` query parameter so URLs can appear in error messages. */
  private redactApiKey(urlArg: string) {
    return urlArg.replace(/([?&]api_key=)[^&#]*/gi, '$1REDACTED');
  }

  /**
   * Downloads a URL as text.
   *
   * @throws Error on a non-OK status; the message carries the URL with any
   *   `api_key` value redacted.
   */
  private async fetchText(urlArg: string) {
    const response = await fetch(urlArg, {
      headers: {
        'User-Agent': '@fin.cx/opendata laws sync',
      },
    });
    if (!response.ok) {
      throw new Error(`Failed to fetch ${this.redactApiKey(urlArg)}: ${response.status} ${response.statusText}`);
    }
    return response.text();
  }

  /**
   * Downloads a URL and parses the body as JSON. The result is not validated
   * against `TResponse`.
   *
   * @throws Error on a non-OK status; the message carries the URL with any
   *   `api_key` value redacted.
   */
  private async fetchJson<TResponse>(urlArg: string): Promise<TResponse> {
    const response = await fetch(urlArg, {
      headers: {
        'User-Agent': '@fin.cx/opendata laws sync',
        'Accept': 'application/json',
      },
    });
    if (!response.ok) {
      throw new Error(`Failed to fetch ${this.redactApiKey(urlArg)}: ${response.status} ${response.statusText}`);
    }
    return response.json() as Promise<TResponse>;
  }

  /**
   * Builds the regex that captures the content of `<tag ...>...</tag>`.
   * The tag name is matched exactly (looking for `title` does not match
   * `<titleNumber>`), regex metacharacters in the name are escaped, and
   * self-closing tags written with a space (`<tag />`) are not treated as
   * opening tags.
   */
  private buildTagRegex(tagNameArg: string, flagsArg: string) {
    const tagName = this.escapeRegex(tagNameArg);
    return new RegExp(
      `<${tagName}(?:\\s(?:[^>]*[^>/])?)?>([\\s\\S]*?)</${tagName}\\s*>`,
      flagsArg
    );
  }

  /** Returns the trimmed, entity-decoded content of the first `tagNameArg` element, or `''`. */
  private extractTagValue(xmlArg: string, tagNameArg: string) {
    const match = xmlArg.match(this.buildTagRegex(tagNameArg, 'i'));
    return match ? this.decodeHtmlEntities(match[1].trim()) : '';
  }

  /** Returns the trimmed, entity-decoded content of every `tagNameArg` element, in document order. */
  private extractTagValues(xmlArg: string, tagNameArg: string) {
    const matches = Array.from(xmlArg.matchAll(this.buildTagRegex(tagNameArg, 'gi')));
    return matches.map((matchArg) => this.decodeHtmlEntities(matchArg[1].trim()));
  }

  /** Returns the trimmed, entity-decoded first capture group of `regexArg`, or `''` without a match. */
  private extractFirstMatch(textArg: string, regexArg: RegExp) {
    const match = textArg.match(regexArg);
    return match ? this.decodeHtmlEntities(match[1].trim()) : '';
  }

  /**
   * Extracts the heading from a page title such as
   * `42 U.S. Code § 2000e-2 - Unlawful employment practices`.
   *
   * @returns The text after the separator, or the whitespace-normalized title
   *   when it does not have that shape.
   */
  private extractUsLiiHeading(pageTitleArg: string) {
    const normalizedTitle = pageTitleArg.replace(/\s+/g, ' ').trim();
    // Preferred form: the section token is followed by a dash surrounded by
    // spaces. This keeps hyphenated section numbers such as "2000e-2" intact.
    const spacedMatch = normalizedTitle.match(
      /^\d+\s+U\.?S\.? Code\s*§\s*\S+\s+[-–—]\s+(.+)$/i
    );
    if (spacedMatch) {
      return spacedMatch[1].trim();
    }
    // Fallback for titles without spaces around the hyphen.
    const headingMatch = normalizedTitle.match(/^\d+\s+U\.?S\.? Code\s*§\s*[^-]+-\s*(.+)$/i);
    return headingMatch ? headingMatch[1].trim() : normalizedTitle;
  }

  /**
   * Converts XML/HTML markup to plain text: `<br>` and the closing tags of
   * block-level elements become line breaks (in any letter case), all other
   * tags become spaces, whitespace runs are collapsed and entities decoded.
   */
  private markupToText(markupArg: string) {
    return this.decodeHtmlEntities(
      markupArg
        .replace(/<\/?br\s*\/?>/gi, '\n')
        .replace(/<\/(p|div|section|article|li|tr|h1|h2|h3|h4|h5|h6)\s*>/gi, '\n')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\r/g, '')
        .replace(/[ \t]+\n/g, '\n')
        .replace(/\n{3,}/g, '\n\n')
        .replace(/[ \t]{2,}/g, ' ')
        .trim()
    );
  }

  /**
   * Decodes `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;` and numeric
   * (`&#65;`, `&#x41;`) references in one pass, so already-decoded output is
   * never decoded a second time. Numeric references outside the Unicode range
   * are left untouched instead of throwing.
   */
  private decodeHtmlEntities(textArg: string) {
    const namedEntities: Record<string, string> = {
      amp: '&',
      lt: '<',
      gt: '>',
      quot: '"',
      apos: "'",
      nbsp: ' ',
    };
    return textArg.replace(
      /&(?:(amp|lt|gt|quot|apos|nbsp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
      (matchArg: string, namedArg?: string, decimalArg?: string, hexArg?: string) => {
        if (namedArg !== undefined) {
          return namedEntities[namedArg] ?? matchArg;
        }
        const codePoint = decimalArg !== undefined
          ? Number.parseInt(decimalArg, 10)
          : Number.parseInt(hexArg ?? '', 16);
        // String.fromCodePoint throws a RangeError above U+10FFFF.
        return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff
          ? String.fromCodePoint(codePoint)
          : matchArg;
      }
    );
  }

  /** Escapes regex metacharacters so `valueArg` is matched literally. */
  private escapeRegex(valueArg: string) {
    return valueArg.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /** Returns the first trimmed line longer than 20 characters, or `''`. */
  private firstMeaningfulLine(textArg: string) {
    return textArg
      .split('\n')
      .map((lineArg) => lineArg.trim())
      .find((lineArg) => lineArg.length > 20) ?? '';
  }
}