// Listing metadata: 1011 lines, 30 KiB, TypeScript.
import * as plugins from '../plugins.js';
|
|
import * as paths from '../paths.js';
|
|
import { LawRecord } from './classes.lawrecord.js';
|
|
import type {
|
|
ILawLookupRequest,
|
|
ILawSearchRequest,
|
|
ILawServiceConfig,
|
|
ILawSyncRequest,
|
|
ILawSyncResult,
|
|
TJurisdiction,
|
|
TLawSource,
|
|
TRawLawFormat,
|
|
TUsLawCollection,
|
|
} from './interfaces.law.js';
|
|
|
|
/**
 * Normalized law payload handed to `LawService.upsertLaw`.
 *
 * Every field is copied onto a `LawRecord`; optional string fields that are
 * omitted are persisted as `''` and a missing `sourceMeta` as `{}`.
 */
interface IStoredLawData {
  jurisdiction: TJurisdiction;
  source: TLawSource;
  /** Combined with `jurisdiction` to derive the record's lookup key. */
  identifier: string;
  title: string;
  shortTitle?: string;
  citation?: string;
  type?: string;
  language?: string;
  /** URL the raw body was downloaded from. */
  sourceUrl: string;
  /** Format of `rawBody`. */
  rawFormat: TRawLawFormat;
  /** Body as received from the source. */
  rawBody: string;
  /** Plain-text rendering of the body. */
  text: string;
  dateIssued?: string;
  lastModified?: string;
  /** Source-specific string metadata (e.g. slug, CELEX, package id). */
  sourceMeta?: Record<string, string>;
}
|
|
|
|
/**
 * One `<item>` of the gesetze-im-internet.de table of contents, as produced by
 * `LawService.parseGermanyToc`.
 */
interface IGermanyLawTocEntry {
  /** Entity-decoded `<title>` of the item. */
  title: string;
  /** `<link>` of the item, upgraded from `http://` to `https://`. */
  xmlZipUrl: string;
  /** Path slug derived from `xmlZipUrl`; used as the law identifier. */
  slug: string;
}
|
|
|
|
/**
 * Metadata for one EU legal act, mapped from a SPARQL result binding
 * (or synthesized from the scraped EUR-Lex page when no binding is available).
 * Fields missing from a binding are represented as `''`.
 */
interface IEuLawMetadata {
  celex: string;
  title: string;
  /** Resource-type code (the SPARQL `?type_code` binding). */
  type: string;
  dateIssued: string;
  /** ELI URI, when one is known. */
  eli: string;
  lastModified: string;
  /** Language code the metadata was requested in. */
  language: string;
}
|
|
|
|
/**
 * The subset of a SPARQL JSON results document that this file reads:
 * a list of bindings, each mapping a variable name to an object with a `value`.
 */
interface ISparqlJsonResult {
  results?: {
    bindings?: Array<Record<string, { value: string }>>;
  };
}
|
|
|
|
/**
 * The subset of a GovInfo collection listing response that this file reads.
 */
interface IGovInfoCollectionResponse {
  /** URL of the next result page; absent on the last page. */
  nextPage?: string;
  /** Packages listed on this page. */
  packages?: Array<{
    packageId: string;
  }>;
}
|
|
|
|
/**
 * The subset of a GovInfo package summary response that this file reads.
 */
interface IGovInfoSummary {
  packageId: string;
  collectionCode: string;
  title?: string;
  /** Short titles; only the first entry is used. */
  shortTitle?: Array<{
    title: string;
  }>;
  detailsLink?: string;
  dateIssued?: string;
  lastModified?: string;
  /** Used to build the citation for `USCODE` packages. */
  titleNumber?: string;
  /** Used together with `documentNumber` to build a Public Law citation. */
  congress?: string;
  documentNumber?: string;
  documentType?: string;
  /** Download links for the package body. */
  download?: {
    txtLink?: string;
    uslmLink?: string;
  };
}
|
|
|
|
/**
 * Content scraped from a rendered EUR-Lex page by
 * `LawService.fetchEuLawPageContent`.
 */
interface IEuPageContent {
  /** `innerHTML` of the `#text` element (or of `<body>` when `#text` is missing). */
  html: string;
  /** `innerText` of the same element. */
  text: string;
  /** All `#document1 p.oj-doc-ti` paragraphs joined with a space. */
  title: string;
  /** Text of `#document1 p.oj-hd-date`, or `''`. */
  dateIssued: string;
  /** First title paragraph, or `''`. */
  type: string;
  /** `href` of the first link containing `data.europa.eu/eli`, or `''`. */
  eli: string;
}
|
|
|
|
/**
 * A parsed U.S. Code citation such as `42 U.S.C. § 1983(a)`, as produced by
 * `LawService.parseUsCodeCitation`.
 */
interface IUsCodeCitation {
  /** Title number, upper-cased (e.g. `42`). */
  titleNumber: string;
  /** Section number as written in the citation (e.g. `1983`). */
  sectionNumber: string;
  /** Trailing parenthesized subsection path (e.g. `(a)(1)`), or `''`. */
  subsectionPath: string;
  /** `<title> USC <section>`; used as the stored identifier. */
  canonicalIdentifier: string;
  /** Same value as `canonicalIdentifier`; used as the display citation. */
  canonicalCitation: string;
  /** Cornell LII URL for the section. */
  sourceUrl: string;
}
|
|
|
|
/**
 * Downloads laws from public German (gesetze-im-internet.de), EU (EUR-Lex) and
 * US (GovInfo / Cornell LII) sources and stores them as `LawRecord` documents
 * in a local SmartDB instance.
 *
 * The service starts itself lazily: every public operation calls
 * `ensureStarted()` first. Call `stop()` to release the database and, if one
 * was launched, the headless browser.
 */
export class LawService {
  /** SmartData connection; assigned by `start()`. */
  public db!: plugins.smartdata.SmartdataDb;
  public CLawRecord = plugins.smartdata.setDefaultManagerForDoc(this, LawRecord);

  private localSmartDb?: plugins.smartdb.LocalSmartDb;
  private smartBrowser?: plugins.smartbrowser.SmartBrowser;
  private browserStarted = false;
  private started = false;
  /** In-flight `start()` so that concurrent callers share one startup. */
  private startPromise?: Promise<void>;
  /** In-flight browser launch so that concurrent callers share one launch. */
  private browserStartPromise?: Promise<void>;

  private config: Required<Pick<ILawServiceConfig, 'dbFolderPath' | 'dbName' | 'govInfoApiKey'>>;

  /**
   * @param configArg Optional overrides. Defaults: database folder
   *   `<packageDir>/.nogit/law-smartdb`, database name `laws`, GovInfo API key
   *   `DEMO_KEY`.
   */
  constructor(configArg: ILawServiceConfig = {}) {
    this.config = {
      dbFolderPath:
        configArg.dbFolderPath ?? plugins.path.join(paths.packageDir, '.nogit', 'law-smartdb'),
      dbName: configArg.dbName ?? 'laws',
      govInfoApiKey: configArg.govInfoApiKey ?? 'DEMO_KEY',
    };
  }

  /**
   * Starts the local database and connects to it. A no-op when already
   * started; concurrent calls share a single startup.
   */
  public async start(): Promise<void> {
    if (this.started) {
      return;
    }
    if (!this.startPromise) {
      this.startPromise = this.performStart().finally(() => {
        this.startPromise = undefined;
      });
    }
    await this.startPromise;
  }

  /** Performs the actual startup; stops the local database again if connecting fails. */
  private async performStart(): Promise<void> {
    await plugins.smartfs.directory(this.config.dbFolderPath).create();
    const localSmartDb = new plugins.smartdb.LocalSmartDb({
      folderPath: this.config.dbFolderPath,
    });
    this.localSmartDb = localSmartDb;
    const connectionInfo = await localSmartDb.start();

    try {
      this.db = new plugins.smartdata.SmartdataDb({
        mongoDbUrl: connectionInfo.connectionUri,
        mongoDbName: this.config.dbName,
      });
      await this.db.init();
      // Insert-then-delete round trip; presumably forces the database to be
      // materialized before first real use (TODO confirm).
      const bootstrapCollection = this.db.mongoDb.collection('_opendata_bootstrap');
      await bootstrapCollection.insertOne({ createdAt: new Date() });
      await bootstrapCollection.deleteMany({});
    } catch (errorArg) {
      // Do not leave the local database running when the connection failed.
      try {
        await localSmartDb.stop();
      } catch {
        // Best effort; the original startup error is the one worth reporting.
      }
      this.localSmartDb = undefined;
      throw errorArg;
    }

    this.started = true;
  }

  /**
   * Stops the headless browser (if launched) and the database (if started).
   * The local database is stopped even when closing the connection throws.
   */
  public async stop(): Promise<void> {
    if (this.browserStarted && this.smartBrowser) {
      await this.smartBrowser.stop();
      this.browserStarted = false;
    }

    if (this.started) {
      try {
        await this.db.close();
      } finally {
        await this.localSmartDb?.stop();
        this.started = false;
      }
    }
  }

  /**
   * Downloads a single law from its source and stores it, regardless of
   * whether it is already cached.
   *
   * @throws Error for an unsupported jurisdiction or when the download fails.
   */
  public async syncLaw(requestArg: ILawLookupRequest): Promise<LawRecord> {
    await this.ensureStarted();

    switch (requestArg.jurisdiction) {
      case 'de':
        return this.syncGermanyLaw(requestArg.identifier);
      case 'eu':
        return this.syncEuLaw(requestArg.identifier, requestArg.language ?? 'EN');
      case 'us':
        return this.syncUsLaw(
          requestArg.identifier,
          requestArg.usCollection,
          this.config.govInfoApiKey
        );
      default:
        throw new Error(`Unsupported jurisdiction: ${requestArg.jurisdiction}`);
    }
  }

  /**
   * Returns the cached law for the request, downloading it when it is not
   * cached yet or when `forceSync` is set.
   */
  public async getLaw(requestArg: ILawLookupRequest): Promise<LawRecord | null> {
    await this.ensureStarted();

    const lookupKey = this.createLookupKey(requestArg.jurisdiction, requestArg.identifier);
    const cachedLaw = await this.getLawByLookupKey(lookupKey);
    if (cachedLaw && !requestArg.forceSync) {
      return cachedLaw;
    }

    return this.syncLaw(requestArg);
  }

  /**
   * Searches stored laws, optionally restricted to one jurisdiction.
   *
   * Uses `CLawRecord.search` first; if that throws, falls back to a
   * case-insensitive literal match of the query against identifier, title,
   * short title, citation and text. At most `limit` (default 20) records are
   * returned.
   */
  public async searchLaws(requestArg: ILawSearchRequest): Promise<LawRecord[]> {
    await this.ensureStarted();

    const limit = requestArg.limit ?? 20;
    const baseFilter = requestArg.jurisdiction
      ? ({ jurisdiction: requestArg.jurisdiction } as Record<string, unknown>)
      : undefined;

    try {
      const results = await this.CLawRecord.search(
        requestArg.query,
        baseFilter ? { filter: baseFilter } : undefined
      );
      return results.slice(0, limit);
    } catch {
      const regex = new RegExp(this.escapeRegex(requestArg.query), 'i');
      const searchFilter = {
        $or: [
          { identifier: { $regex: regex } },
          { title: { $regex: regex } },
          { shortTitle: { $regex: regex } },
          { citation: { $regex: regex } },
          { text: { $regex: regex } },
        ],
      };
      const mongoFilter = baseFilter ? { $and: [baseFilter, searchFilter] } : searchFilter;

      const results = await this.CLawRecord.getInstances(mongoFilter as any);
      return results.slice(0, limit);
    }
  }

  /**
   * Bulk-downloads laws for one jurisdiction.
   *
   * @throws Error for an unsupported jurisdiction.
   */
  public async syncLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    await this.ensureStarted();

    switch (requestArg.jurisdiction) {
      case 'de':
        return this.syncGermanyLaws(requestArg);
      case 'eu':
        return this.syncEuLaws(requestArg);
      case 'us':
        return this.syncUsLaws(requestArg);
      default:
        throw new Error(`Unsupported jurisdiction: ${requestArg.jurisdiction}`);
    }
  }

  /** Starts the service on first use. */
  private async ensureStarted() {
    if (!this.started) {
      await this.start();
    }
  }

  /** Returns the shared headless browser, launching it on first use. */
  private async ensureBrowser() {
    if (!this.smartBrowser) {
      this.smartBrowser = new plugins.smartbrowser.SmartBrowser();
    }
    const smartBrowser = this.smartBrowser;

    if (!this.browserStarted) {
      if (!this.browserStartPromise) {
        this.browserStartPromise = this.launchBrowser(smartBrowser).finally(() => {
          this.browserStartPromise = undefined;
        });
      }
      await this.browserStartPromise;
    }

    return smartBrowser;
  }

  /** Starts the given browser and records that it is running. */
  private async launchBrowser(smartBrowserArg: plugins.smartbrowser.SmartBrowser): Promise<void> {
    await smartBrowserArg.start();
    this.browserStarted = true;
  }

  /** Builds the `<jurisdiction>:<normalized identifier>` key records are looked up by. */
  private createLookupKey(jurisdictionArg: TJurisdiction, identifierArg: string) {
    return `${jurisdictionArg}:${this.normalizeIdentifierForLookup(jurisdictionArg, identifierArg)}`;
  }

  /**
   * Normalizes an identifier per jurisdiction: German slugs are stripped of
   * URL decoration, CELEX numbers lose a `CELEX:` prefix and are upper-cased,
   * U.S. Code citations become `<title> USC <section>` and other US
   * identifiers are upper-cased.
   */
  private normalizeIdentifierForLookup(jurisdictionArg: TJurisdiction, identifierArg: string) {
    switch (jurisdictionArg) {
      case 'de':
        return this.normalizeGermanySlug(identifierArg);
      case 'eu':
        return this.normalizeCelex(identifierArg);
      case 'us': {
        const usCodeCitation = this.parseUsCodeCitation(identifierArg);
        return usCodeCitation
          ? usCodeCitation.canonicalIdentifier
          : identifierArg.trim().toUpperCase();
      }
      default:
        return identifierArg.trim();
    }
  }

  /** Looks up a stored record by its lookup key. */
  private async getLawByLookupKey(lookupKeyArg: string) {
    return LawRecord.getByLookupKey(lookupKeyArg);
  }

  /**
   * Creates or updates the record for `dataArg` (matched by lookup key) and
   * saves it. Omitted optional fields are stored as `''` / `{}`.
   */
  private async upsertLaw(dataArg: IStoredLawData): Promise<LawRecord> {
    const lookupKey = this.createLookupKey(dataArg.jurisdiction, dataArg.identifier);
    let lawRecord = await this.getLawByLookupKey(lookupKey);

    if (!lawRecord) {
      lawRecord = new this.CLawRecord();
      lawRecord.id = await this.CLawRecord.getNewId();
      lawRecord.lookupKey = lookupKey;
    }

    const now = new Date();
    lawRecord.jurisdiction = dataArg.jurisdiction;
    lawRecord.source = dataArg.source;
    lawRecord.identifier = dataArg.identifier;
    lawRecord.title = dataArg.title;
    lawRecord.shortTitle = dataArg.shortTitle ?? '';
    lawRecord.citation = dataArg.citation ?? '';
    lawRecord.type = dataArg.type ?? '';
    lawRecord.language = dataArg.language ?? '';
    lawRecord.sourceUrl = dataArg.sourceUrl;
    lawRecord.rawFormat = dataArg.rawFormat;
    lawRecord.rawBody = dataArg.rawBody;
    lawRecord.text = dataArg.text;
    lawRecord.dateIssued = dataArg.dateIssued ?? '';
    lawRecord.lastModified = dataArg.lastModified ?? '';
    lawRecord.sourceMeta = dataArg.sourceMeta ?? {};
    lawRecord.fetchedAt = now;
    lawRecord.syncedAt = new Date(now.getTime());

    await lawRecord.save();
    return lawRecord;
  }

  /**
   * Downloads German laws listed in the gesetze-im-internet.de table of
   * contents, one after another, honoring `offset` and `limit`.
   */
  private async syncGermanyLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const tocXml = await this.fetchText('https://www.gesetze-im-internet.de/gii-toc.xml');
    const tocEntries = this.parseGermanyToc(tocXml);
    const offset = requestArg.offset ?? 0;
    const targetEntries =
      typeof requestArg.limit === 'number'
        ? tocEntries.slice(offset, offset + requestArg.limit)
        : tocEntries.slice(offset);

    const identifiers: string[] = [];
    for (const entry of targetEntries) {
      const syncedLaw = await this.syncGermanyLaw(entry);
      identifiers.push(syncedLaw.identifier);
    }

    return {
      jurisdiction: 'de',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Downloads one German law as `xml.zip`, extracts the XML and stores it.
   *
   * @param identifierOrEntryArg A slug / URL of the law, or a TOC entry.
   * @throws Error when the archive contains no file.
   */
  private async syncGermanyLaw(identifierOrEntryArg: string | IGermanyLawTocEntry): Promise<LawRecord> {
    let germanyEntry: IGermanyLawTocEntry;
    if (typeof identifierOrEntryArg === 'string') {
      const slug = this.normalizeGermanySlug(identifierOrEntryArg);
      germanyEntry = {
        slug,
        title: '',
        xmlZipUrl: `https://www.gesetze-im-internet.de/${slug}/xml.zip`,
      };
    } else {
      germanyEntry = identifierOrEntryArg;
    }

    const extractedFiles = await plugins.smartarchive.SmartArchive.create()
      .url(germanyEntry.xmlZipUrl)
      .toSmartFiles();
    // Prefer the first *.xml member; otherwise fall back to the first file.
    const xmlFile =
      extractedFiles.find((fileArg: plugins.smartfile.SmartFile) =>
        fileArg.relative.toLowerCase().endsWith('.xml')
      ) ?? extractedFiles[0];
    if (!xmlFile) {
      throw new Error(`No XML file found for German law ${germanyEntry.slug}`);
    }

    const xmlBody = xmlFile.parseContentAsString('utf8');
    const title = this.extractTagValue(xmlBody, 'langue') || germanyEntry.title || germanyEntry.slug;
    const citation = this.extractTagValue(xmlBody, 'jurabk') || germanyEntry.slug.toUpperCase();
    const dateIssued = this.extractTagValue(xmlBody, 'ausfertigung-datum');

    return this.upsertLaw({
      jurisdiction: 'de',
      source: 'gesetze-im-internet',
      identifier: germanyEntry.slug,
      title,
      shortTitle: citation,
      citation,
      type: 'law',
      language: 'de',
      sourceUrl: germanyEntry.xmlZipUrl,
      rawFormat: 'xml',
      rawBody: xmlBody,
      text: this.markupToText(xmlBody),
      dateIssued,
      sourceMeta: {
        slug: germanyEntry.slug,
      },
    });
  }

  /** Extracts `<item><title>…</title><link>…</link></item>` entries from the TOC XML. */
  private parseGermanyToc(tocXmlArg: string): IGermanyLawTocEntry[] {
    const itemRegex = /<item>\s*<title>([\s\S]*?)<\/title>\s*<link>([\s\S]*?)<\/link>\s*<\/item>/g;
    return Array.from(tocXmlArg.matchAll(itemRegex), (match) => {
      const xmlZipUrl = match[2].trim().replace('http://', 'https://');
      return {
        title: this.decodeHtmlEntities(match[1].trim()),
        xmlZipUrl,
        slug: this.normalizeGermanySlug(xmlZipUrl),
      };
    });
  }

  /**
   * Reduces a gesetze-im-internet.de URL or path to its bare slug by removing
   * the host prefix (with or without `www.`, any letter case), a trailing
   * `/xml.zip` or `/index.html`, and surrounding slashes.
   */
  private normalizeGermanySlug(identifierArg: string) {
    return identifierArg
      .trim()
      .replace(/^https?:\/\/(?:www\.)?gesetze-im-internet\.de\//i, '')
      .replace(/\/xml\.zip$/i, '')
      .replace(/\/index\.html$/i, '')
      .replace(/^\/+/, '')
      .replace(/\/+$/, '');
  }

  /** Strips whitespace and an optional `CELEX:` prefix, then upper-cases. */
  private normalizeCelex(celexArg: string): string {
    return celexArg.trim().replace(/^CELEX:/i, '').toUpperCase();
  }

  /**
   * Validates and upper-cases an EU language code. The value is interpolated
   * into URLs and SPARQL text, so only 2-3 ASCII letters are accepted.
   *
   * @throws Error when the code has any other shape.
   */
  private normalizeEuLanguage(languageArg: string): string {
    const language = languageArg.trim().toUpperCase();
    if (!/^[A-Z]{2,3}$/.test(language)) {
      throw new Error(`Invalid EU language code: ${languageArg}`);
    }
    return language;
  }

  /** Clamps a number to a non-negative integer; non-finite values become 0. */
  private toNonNegativeInteger(valueArg: number): number {
    return Number.isFinite(valueArg) ? Math.max(0, Math.trunc(valueArg)) : 0;
  }

  /**
   * Downloads EU regulations, directives and decisions page by page
   * (25 per metadata query), newest first, honoring `offset` and `limit`.
   */
  private async syncEuLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const language = this.normalizeEuLanguage(requestArg.language ?? 'EN');
    const identifiers: string[] = [];
    const pageSize = 25;
    let currentOffset = requestArg.offset ?? 0;
    let remaining = requestArg.limit;

    while (remaining === undefined || remaining > 0) {
      const currentPageSize = remaining === undefined ? pageSize : Math.min(pageSize, remaining);
      const metadataPage = await this.fetchEuMetadataPage(currentPageSize, currentOffset, language);
      if (metadataPage.length === 0) {
        break;
      }

      for (const metadata of metadataPage) {
        if (!metadata.celex) {
          // A binding without a CELEX number cannot be resolved to a page.
          continue;
        }
        const syncedLaw = await this.syncEuLaw(metadata.celex, language, metadata);
        identifiers.push(syncedLaw.identifier);
      }

      currentOffset += metadataPage.length;
      if (remaining !== undefined) {
        remaining -= metadataPage.length;
      }
    }

    return {
      jurisdiction: 'eu',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Scrapes one EU act from EUR-Lex and stores it.
   *
   * @param celexArg CELEX number, with or without a `CELEX:` prefix.
   * @param languageArg Language code (default `EN`).
   * @param metadataArg Metadata already known from SPARQL; when omitted it is
   *   derived from the scraped page.
   * @throws Error when the CELEX number is empty, the language code is
   *   invalid, or the page yields no text.
   */
  private async syncEuLaw(
    celexArg: string,
    languageArg = 'EN',
    metadataArg?: IEuLawMetadata
  ): Promise<LawRecord> {
    const language = this.normalizeEuLanguage(languageArg);
    const celex = this.normalizeCelex(celexArg);
    if (!celex) {
      throw new Error('CELEX identifier must not be empty');
    }

    const pageUrl = `https://eur-lex.europa.eu/legal-content/${language}/TXT/?uri=CELEX:${encodeURIComponent(celex)}`;
    const euPageContent = await this.fetchEuLawPageContent(pageUrl);
    const metadata: IEuLawMetadata = metadataArg ?? {
      celex,
      title: euPageContent.title,
      type: euPageContent.type,
      dateIssued: euPageContent.dateIssued,
      eli: euPageContent.eli,
      lastModified: '',
      language,
    };

    return this.upsertLaw({
      jurisdiction: 'eu',
      source: 'eur-lex',
      identifier: metadata.celex,
      title:
        metadata.title ||
        euPageContent.title ||
        this.firstMeaningfulLine(euPageContent.text) ||
        metadata.celex,
      citation: metadata.eli || euPageContent.eli || metadata.celex,
      type: metadata.type || euPageContent.type,
      language: metadata.language.toLowerCase(),
      sourceUrl: pageUrl,
      rawFormat: 'html',
      rawBody: euPageContent.html,
      text: euPageContent.text,
      dateIssued: metadata.dateIssued || euPageContent.dateIssued,
      lastModified: metadata.lastModified,
      sourceMeta: {
        celex: metadata.celex,
        eli: metadata.eli || euPageContent.eli,
      },
    });
  }

  /**
   * Renders a EUR-Lex page in the headless browser and extracts the law body
   * (`#text`, or `<body>` as fallback), title parts, date and ELI link.
   *
   * @throws Error when the extracted text is blank; navigation and selector
   *   waits each time out after 60 seconds.
   */
  private async fetchEuLawPageContent(pageUrlArg: string): Promise<IEuPageContent> {
    const smartBrowser = await this.ensureBrowser();
    const page = await smartBrowser.headlessBrowser.newPage();

    try {
      await page.goto(pageUrlArg, {
        waitUntil: 'domcontentloaded',
        timeout: 60000,
      });
      await page.waitForSelector('#text', {
        timeout: 60000,
      });

      // Runs inside the page: must not reference anything from this scope.
      const pageContent = await page.evaluate(() => {
        const textNode = document.querySelector('#text') as HTMLElement | null;
        const titleParts = Array.from(document.querySelectorAll('#document1 p.oj-doc-ti'))
          .map((elementArg) => elementArg.textContent?.trim() || '')
          .filter((itemArg) => itemArg.length > 0);
        const dateIssued =
          (document.querySelector('#document1 p.oj-hd-date') as HTMLElement | null)?.innerText?.trim() ||
          '';
        const eli =
          (document.querySelector('a[href*="data.europa.eu/eli"]') as HTMLAnchorElement | null)?.href ||
          '';
        const contentNode = textNode ?? document.body;

        return {
          html: contentNode.innerHTML,
          text: contentNode.innerText,
          title: titleParts.join(' '),
          dateIssued,
          type: titleParts[0] || '',
          eli,
        };
      });

      if (!pageContent.text.trim()) {
        throw new Error(`EUR-Lex returned empty text for ${pageUrlArg}`);
      }

      return pageContent;
    } finally {
      await page.close().catch(() => {});
    }
  }

  /**
   * Builds the metadata SPARQL query shared by the page and single-CELEX
   * lookups.
   *
   * @param valuesClauseArg `VALUES` clause restricting the result set.
   * @param languageArg Language code for the title filter; validated here.
   * @param solutionModifiersArg Trailing `ORDER BY` / `LIMIT` / `OFFSET` text.
   */
  private buildEuMetadataQuery(
    valuesClauseArg: string,
    languageArg: string,
    solutionModifiersArg: string
  ): string {
    const titleLanguage = this.normalizeEuLanguage(languageArg).toLowerCase();
    return [
      'PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>',
      'PREFIX resource-type: <http://publications.europa.eu/resource/authority/resource-type/>',
      'PREFIX owl: <http://www.w3.org/2002/07/owl#>',
      'PREFIX cmr: <http://publications.europa.eu/ontology/cdm/cmr#>',
      '',
      'SELECT',
      '  ?celex',
      '  ?title',
      '  ?date',
      '  ?type_code',
      '  (SAMPLE(?eli_candidate) AS ?eli)',
      '  (MAX(?modified_candidate) AS ?modified_at)',
      'WHERE {',
      `  ${valuesClauseArg}`,
      '',
      '  ?work a cdm:resource_legal ;',
      '    cdm:resource_legal_id_celex ?celex ;',
      '    cdm:work_title ?title ;',
      '    cdm:work_date_document ?date ;',
      '    cdm:work_has_resource-type ?type .',
      '',
      `  FILTER(LANG(?title) = "${titleLanguage}")`,
      '',
      '  BIND(REPLACE(STR(?type), "^.*/resource-type/", "") AS ?type_code)',
      '',
      '  OPTIONAL {',
      '    ?work owl:sameAs ?eli_candidate .',
      '    FILTER(CONTAINS(STR(?eli_candidate), "/resource/eli/"))',
      '  }',
      '',
      '  OPTIONAL {',
      '    ?work cmr:lastModificationDate ?modified_candidate .',
      '  }',
      '}',
      'GROUP BY ?celex ?title ?date ?type_code',
      solutionModifiersArg,
    ].join('\n');
  }

  /**
   * Fetches one page of metadata for regulations, directives and decisions,
   * ordered by document date (then CELEX) descending.
   */
  private async fetchEuMetadataPage(
    limitArg: number,
    offsetArg: number,
    languageArg: string
  ): Promise<IEuLawMetadata[]> {
    const sparqlQuery = this.buildEuMetadataQuery(
      'VALUES ?type { resource-type:REG resource-type:DIR resource-type:DEC }',
      languageArg,
      [
        'ORDER BY DESC(?date) DESC(?celex)',
        `LIMIT ${this.toNonNegativeInteger(limitArg)}`,
        `OFFSET ${this.toNonNegativeInteger(offsetArg)}`,
      ].join('\n')
    );

    return this.fetchEuMetadataFromQuery(sparqlQuery, languageArg);
  }

  /**
   * Fetches metadata for a single CELEX number.
   *
   * @throws Error when the CELEX number contains characters outside
   *   `[0-9A-Za-z()._-]` or when no metadata is found.
   */
  private async fetchEuMetadataByCelex(
    celexArg: string,
    languageArg: string
  ): Promise<IEuLawMetadata> {
    // The value is embedded in a SPARQL string literal; reject anything that
    // could terminate the literal.
    if (!/^[0-9A-Za-z()._-]+$/.test(celexArg)) {
      throw new Error(`Invalid CELEX identifier: ${celexArg}`);
    }

    const sparqlQuery = this.buildEuMetadataQuery(
      `VALUES ?celex { "${celexArg}" }`,
      languageArg,
      'LIMIT 1'
    );

    const results = await this.fetchEuMetadataFromQuery(sparqlQuery, languageArg);
    if (!results[0]) {
      throw new Error(`No EUR-Lex metadata found for CELEX ${celexArg}`);
    }

    return results[0];
  }

  /**
   * Runs a SPARQL query against the Publications Office endpoint and maps each
   * binding to `IEuLawMetadata` (missing variables become `''`).
   *
   * @throws Error on a non-2xx response.
   */
  private async fetchEuMetadataFromQuery(
    queryArg: string,
    languageArg: string
  ): Promise<IEuLawMetadata[]> {
    const searchParams = new URLSearchParams({
      query: queryArg,
      format: 'application/sparql-results+json',
    });

    const response = await fetch(`https://publications.europa.eu/webapi/rdf/sparql?${searchParams}`, {
      headers: {
        'Accept': 'application/sparql-results+json',
      },
    });

    if (!response.ok) {
      throw new Error(`Failed to query EU metadata: ${response.status} ${response.statusText}`);
    }

    const responseData = (await response.json()) as ISparqlJsonResult;
    const bindings = responseData.results?.bindings ?? [];
    return bindings.map((bindingArg) => ({
      celex: bindingArg.celex?.value ?? '',
      title: bindingArg.title?.value ?? '',
      type: bindingArg.type_code?.value ?? '',
      dateIssued: bindingArg.date?.value ?? '',
      eli: bindingArg.eli?.value ?? '',
      lastModified: bindingArg.modified_at?.value ?? '',
      language: languageArg,
    }));
  }

  /**
   * Downloads packages of a GovInfo collection (default `PLAW`) modified since
   * `since` (default 1900-01-01), following `nextPage` links until `limit` is
   * reached or the listing ends.
   */
  private async syncUsLaws(requestArg: ILawSyncRequest): Promise<ILawSyncResult> {
    const collection = requestArg.usCollection ?? 'PLAW';
    const apiKey = requestArg.govInfoApiKey ?? this.config.govInfoApiKey;
    const pageSize = 25;
    let remaining = requestArg.limit;
    const identifiers: string[] = [];
    // NOTE(review): a numeric `offset` is forwarded as GovInfo's `offsetMark`;
    // confirm against the API that a plain number is an accepted mark.
    let nextPageUrl = this.buildGovInfoCollectionUrl(
      collection,
      requestArg.since ?? new Date('1900-01-01T00:00:00.000Z'),
      pageSize,
      requestArg.offset ? String(requestArg.offset) : '*',
      apiKey
    );

    while (nextPageUrl && (remaining === undefined || remaining > 0)) {
      const collectionResponse = await this.fetchJson<IGovInfoCollectionResponse>(nextPageUrl);
      const packageEntries = collectionResponse.packages ?? [];
      if (packageEntries.length === 0) {
        break;
      }

      const currentBatch =
        remaining === undefined ? packageEntries : packageEntries.slice(0, remaining);

      for (const packageEntry of currentBatch) {
        const syncedLaw = await this.syncUsLaw(packageEntry.packageId, collection, apiKey);
        identifiers.push(syncedLaw.identifier);
      }

      if (remaining !== undefined) {
        remaining -= currentBatch.length;
      }

      nextPageUrl = collectionResponse.nextPage
        ? this.withGovInfoApiKey(collectionResponse.nextPage, apiKey)
        : '';
    }

    return {
      jurisdiction: 'us',
      syncedCount: identifiers.length,
      identifiers,
    };
  }

  /**
   * Downloads one US law. U.S. Code citations are resolved via Cornell LII;
   * package ids are first tried against the public GovInfo content URLs and
   * only then against the GovInfo API.
   *
   * @param packageIdArg GovInfo package id or a U.S. Code citation.
   * @param collectionArg Collection of the package; when omitted, `USCODE` is
   *   assumed for ids starting with `USCODE-` and `PLAW` otherwise.
   * @param apiKeyArg GovInfo API key; defaults to the configured key.
   * @throws Error when the package offers no downloadable text.
   */
  private async syncUsLaw(
    packageIdArg: string,
    collectionArg?: TUsLawCollection,
    apiKeyArg?: string
  ): Promise<LawRecord> {
    const usCodeCitation = this.parseUsCodeCitation(packageIdArg);
    if (usCodeCitation) {
      return this.syncUsCodeCitation(usCodeCitation);
    }

    const collection: TUsLawCollection =
      collectionArg ?? (packageIdArg.startsWith('USCODE-') ? 'USCODE' : 'PLAW');
    const directContentLaw = await this.trySyncUsLawFromPublicContent(packageIdArg, collection);
    if (directContentLaw) {
      return directContentLaw;
    }

    const apiKey = apiKeyArg ?? this.config.govInfoApiKey;
    const summary = await this.fetchJson<IGovInfoSummary>(
      this.withGovInfoApiKey(
        `https://api.govinfo.gov/packages/${encodeURIComponent(packageIdArg)}/summary`,
        apiKey
      )
    );

    // Public laws prefer USLM XML, everything else prefers plain text.
    const rawUrl =
      collection === 'PLAW'
        ? summary.download?.uslmLink ?? summary.download?.txtLink
        : summary.download?.txtLink ?? summary.download?.uslmLink;

    if (!rawUrl) {
      throw new Error(`No downloadable GovInfo text found for ${packageIdArg}`);
    }

    const rawFormat: TRawLawFormat = rawUrl.includes('/uslm') ? 'xml' : 'text';
    const rawBody = await this.fetchText(this.withGovInfoApiKey(rawUrl, apiKey));
    const citation =
      collection === 'USCODE'
        ? `Title ${summary.titleNumber ?? ''} U.S. Code`.trim()
        : `Public Law ${summary.congress ?? ''}-${summary.documentNumber ?? ''}`.trim();
    const source = collection === 'USCODE' ? 'govinfo-uscode' : 'govinfo-plaw';

    return this.upsertLaw({
      jurisdiction: 'us',
      source,
      identifier: packageIdArg,
      title: summary.title ?? packageIdArg,
      shortTitle: summary.shortTitle?.[0]?.title ?? '',
      citation,
      type: summary.documentType ?? collection,
      language: 'en',
      sourceUrl: summary.detailsLink ?? rawUrl,
      rawFormat,
      rawBody,
      text: rawFormat === 'xml' ? this.markupToText(rawBody) : this.decodeHtmlEntities(rawBody),
      dateIssued: summary.dateIssued,
      lastModified: summary.lastModified,
      sourceMeta: {
        packageId: packageIdArg,
        collection,
      },
    });
  }

  /** Downloads a U.S. Code section from Cornell LII and stores its main tab content. */
  private async syncUsCodeCitation(citationArg: IUsCodeCitation): Promise<LawRecord> {
    const rawBody = await this.fetchText(citationArg.sourceUrl);
    const pageTitle = this.extractFirstMatch(
      rawBody,
      /<h1[^>]*id="page_title"[^>]*>([\s\S]*?)<\/h1>/i
    );
    // Keep only the first tab pane when it can be located; otherwise the whole page.
    const sectionMarkup =
      this.extractFirstMatch(
        rawBody,
        /<div class="tab-pane active" id="tab_default_1">([\s\S]*?)<\/div>\s*<div class="tab-pane" id="tab_default_2">/i
      ) || rawBody;
    const resolvedTitle = this.extractUsLiiHeading(pageTitle) || citationArg.canonicalCitation;

    return this.upsertLaw({
      jurisdiction: 'us',
      source: 'law-cornell-lii',
      identifier: citationArg.canonicalIdentifier,
      title: resolvedTitle,
      shortTitle: citationArg.canonicalCitation,
      citation: citationArg.canonicalCitation,
      type: 'USCODE_SECTION',
      language: 'en',
      sourceUrl: citationArg.sourceUrl,
      rawFormat: 'html',
      rawBody: sectionMarkup,
      text: this.markupToText(sectionMarkup),
      sourceMeta: {
        titleNumber: citationArg.titleNumber,
        sectionNumber: citationArg.sectionNumber,
        subsectionPath: citationArg.subsectionPath,
      },
    });
  }

  /**
   * Tries to download a package from the public GovInfo content URLs (no API
   * key needed) and store it.
   *
   * @returns The stored record, or `null` when the download fails. Errors
   *   raised while storing the record are NOT swallowed.
   */
  private async trySyncUsLawFromPublicContent(
    packageIdArg: string,
    collectionArg: TUsLawCollection
  ): Promise<LawRecord | null> {
    const packagePath = encodeURIComponent(packageIdArg);
    const rawUrl =
      collectionArg === 'USCODE'
        ? `https://www.govinfo.gov/content/pkg/${packagePath}/html/${packagePath}.htm`
        : `https://www.govinfo.gov/content/pkg/${packagePath}/uslm/${packagePath}.xml`;

    let rawBody: string;
    try {
      rawBody = await this.fetchText(rawUrl);
    } catch {
      // Public content is unavailable; the caller falls back to the API.
      return null;
    }

    const citationValues = this.extractTagValues(rawBody, 'citableAs');
    const title =
      this.extractTagValue(rawBody, 'dc:title')
        .replace(/^Public Law\s+[^:]+:\s*/i, '')
        .trim() || packageIdArg;
    const shortTitle = this.extractTagValue(rawBody, 'shortTitle');
    const dateIssued =
      this.extractTagValue(rawBody, 'approvedDate') || this.extractTagValue(rawBody, 'dc:date');
    const type = this.extractTagValue(rawBody, 'dc:type') || collectionArg;

    return this.upsertLaw({
      jurisdiction: 'us',
      source: collectionArg === 'USCODE' ? 'govinfo-uscode' : 'govinfo-plaw',
      identifier: packageIdArg,
      title,
      shortTitle,
      citation: citationValues[0] || packageIdArg,
      type,
      language: 'en',
      sourceUrl: rawUrl,
      rawFormat: rawUrl.endsWith('.xml') ? 'xml' : 'html',
      rawBody,
      text: this.markupToText(rawBody),
      dateIssued,
      sourceMeta: {
        packageId: packageIdArg,
        collection: collectionArg,
      },
    });
  }

  /**
   * Parses citations such as `42 U.S.C. § 1983(a)`, `42 USC 1983` or
   * `26 USCODE 501`.
   *
   * @returns The parsed citation, or `null` for blank input, GovInfo package
   *   ids (`PLAW-…` / `USCODE-…`) and anything else that does not match.
   */
  private parseUsCodeCitation(identifierArg: string): IUsCodeCitation | null {
    const cleanedIdentifier = identifierArg
      .trim()
      .replace(/\u00a0/g, ' ')
      .replace(/\s+/g, ' ')
      .replace(/§+/g, ' ');

    if (!cleanedIdentifier || /^(PLAW|USCODE)-/i.test(cleanedIdentifier)) {
      return null;
    }

    const citationMatch = cleanedIdentifier.match(
      /^(\d+[A-Za-z]*)\s*(?:U\.?\s*S\.?\s*C\.?|USC(?:ODE)?)\s+([0-9A-Za-z.-]+)((?:\([A-Za-z0-9]+\))*)$/i
    );
    if (!citationMatch) {
      return null;
    }

    const titleNumber = citationMatch[1].toUpperCase();
    const sectionNumber = citationMatch[2];
    const subsectionPath = citationMatch[3] ?? '';
    const canonicalCitation = `${titleNumber} USC ${sectionNumber}`;

    return {
      titleNumber,
      sectionNumber,
      subsectionPath,
      canonicalIdentifier: canonicalCitation,
      canonicalCitation,
      sourceUrl: `https://www.law.cornell.edu/uscode/text/${encodeURIComponent(titleNumber)}/${encodeURIComponent(sectionNumber)}`,
    };
  }

  /** Builds the GovInfo collection listing URL (timestamp without milliseconds). */
  private buildGovInfoCollectionUrl(
    collectionArg: TUsLawCollection,
    sinceArg: Date,
    pageSizeArg: number,
    offsetMarkArg: string,
    apiKeyArg: string
  ) {
    const sinceIso = sinceArg.toISOString().replace(/\.\d{3}Z$/, 'Z');
    return this.withGovInfoApiKey(
      `https://api.govinfo.gov/collections/${collectionArg}/${sinceIso}?pageSize=${pageSizeArg}&offsetMark=${encodeURIComponent(offsetMarkArg)}`,
      apiKeyArg
    );
  }

  /** Returns `urlArg` with its `api_key` query parameter set to `apiKeyArg`. */
  private withGovInfoApiKey(urlArg: string, apiKeyArg: string) {
    const url = new URL(urlArg);
    url.searchParams.set('api_key', apiKeyArg);
    return url.toString();
  }

  /** Masks the value of any `api_key` query parameter so it is safe to log. */
  private redactApiKey(urlArg: string): string {
    return urlArg.replace(/([?&]api_key=)[^&#]*/gi, '$1REDACTED');
  }

  /**
   * Performs a GET with the service's User-Agent plus `extraHeadersArg`.
   *
   * @throws Error on a non-2xx response; the URL in the message has its API
   *   key redacted.
   */
  private async fetchOk(urlArg: string, extraHeadersArg: Record<string, string> = {}) {
    const response = await fetch(urlArg, {
      headers: {
        'User-Agent': '@fin.cx/opendata laws sync',
        ...extraHeadersArg,
      },
    });

    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${this.redactApiKey(urlArg)}: ${response.status} ${response.statusText}`
      );
    }

    return response;
  }

  /** Fetches a URL and returns the response body as text. */
  private async fetchText(urlArg: string) {
    const response = await this.fetchOk(urlArg);
    return response.text();
  }

  /** Fetches a URL and returns the parsed JSON body (not validated against `TResponse`). */
  private async fetchJson<TResponse>(urlArg: string): Promise<TResponse> {
    const response = await this.fetchOk(urlArg, { 'Accept': 'application/json' });
    return response.json() as Promise<TResponse>;
  }

  /**
   * Builds a case-insensitive regex capturing the content of `<tag …>…</tag>`.
   * The tag name must match exactly, so `title` does not match `<titleNumber>`.
   */
  private buildTagRegex(tagNameArg: string, extraFlagsArg: string): RegExp {
    const tagName = this.escapeRegex(tagNameArg);
    return new RegExp(
      `<${tagName}(?:\\s[^>]*)?>([\\s\\S]*?)</${tagName}\\s*>`,
      `${extraFlagsArg}i`
    );
  }

  /** Returns the entity-decoded, trimmed content of the first `<tag>` element, or `''`. */
  private extractTagValue(xmlArg: string, tagNameArg: string) {
    const match = xmlArg.match(this.buildTagRegex(tagNameArg, ''));
    return match ? this.decodeHtmlEntities(match[1].trim()) : '';
  }

  /** Returns the entity-decoded, trimmed content of every `<tag>` element. */
  private extractTagValues(xmlArg: string, tagNameArg: string) {
    return Array.from(xmlArg.matchAll(this.buildTagRegex(tagNameArg, 'g')), (matchArg) =>
      this.decodeHtmlEntities(matchArg[1].trim())
    );
  }

  /** Returns the entity-decoded, trimmed first capture group of `regexArg`, or `''`. */
  private extractFirstMatch(textArg: string, regexArg: RegExp) {
    const match = textArg.match(regexArg);
    return match ? this.decodeHtmlEntities(match[1].trim()) : '';
  }

  /**
   * Strips the `<n> U.S. Code § <section> - ` prefix from an LII page title.
   * Titles that do not have that shape are returned whitespace-normalized.
   */
  private extractUsLiiHeading(pageTitleArg: string) {
    const normalizedTitle = pageTitleArg.replace(/\s+/g, ' ').trim();
    const headingMatch = normalizedTitle.match(/^\d+\s+U\.?S\.? Code\s*§\s*[^-]+-\s*(.+)$/i);
    return headingMatch ? headingMatch[1].trim() : normalizedTitle;
  }

  /**
   * Converts HTML/XML markup to plain text: `<br>` and closing block tags
   * become newlines, remaining tags become spaces, whitespace is collapsed,
   * and entities are decoded last so that escaped markup stays text.
   */
  private markupToText(markupArg: string) {
    const withoutTags = markupArg
      .replace(/<\/?br\s*\/?>/gi, '\n')
      .replace(/<\/(p|div|section|article|li|tr|h[1-6])>/gi, '\n')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\r/g, '')
      .replace(/[ \t]+\n/g, '\n')
      .replace(/\n{3,}/g, '\n\n')
      .replace(/[ \t]{2,}/g, ' ')
      .trim();
    return this.decodeHtmlEntities(withoutTags);
  }

  /**
   * Decodes `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;` and numeric
   * (`&#65;` / `&#x41;`) character references in a single pass, so text that
   * was escaped twice is only unescaped once. References to code points
   * beyond U+10FFFF are left untouched.
   */
  private decodeHtmlEntities(textArg: string) {
    const namedEntities: Record<string, string> = {
      amp: '&',
      lt: '<',
      gt: '>',
      quot: '"',
      apos: "'",
      nbsp: ' ',
    };

    return textArg.replace(
      /&(?:(amp|lt|gt|quot|apos|nbsp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
      (matchArg: string, nameArg?: string, decimalArg?: string, hexArg?: string) => {
        if (nameArg !== undefined) {
          return namedEntities[nameArg] ?? matchArg;
        }
        const codePoint =
          decimalArg !== undefined
            ? Number.parseInt(decimalArg, 10)
            : Number.parseInt(hexArg ?? '', 16);
        // String.fromCodePoint throws a RangeError outside the Unicode range.
        return Number.isFinite(codePoint) && codePoint <= 0x10ffff
          ? String.fromCodePoint(codePoint)
          : matchArg;
      }
    );
  }

  /** Escapes regex metacharacters so `valueArg` matches literally. */
  private escapeRegex(valueArg: string) {
    return valueArg.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /** Returns the first trimmed line longer than 20 characters, or `''`. */
  private firstMeaningfulLine(textArg: string) {
    return (
      textArg
        .split('\n')
        .map((lineArg) => lineArg.trim())
        .find((lineArg) => lineArg.length > 20) ?? ''
    );
  }
}
|