Appendix C: Reference Implementation Details

Postgres Schema

-- One row per entity known to the identity server.
CREATE TABLE entities (
    entity_id       TEXT PRIMARY KEY,
    entity_type     TEXT NOT NULL,
    -- Lifecycle state; only the three values listed below are accepted.
    status          TEXT NOT NULL
                    CONSTRAINT entities_status_check
                    CHECK (status IN ('provisional', 'canonical', 'merged')),
    -- Optional authority name and the identifier within that authority.
    authority       TEXT,
    authority_id    TEXT,
    -- NOTE(review): assumes confidence is a score in [0, 1] -- confirm.
    -- The upper bound also rejects NaN, which Postgres orders above all numbers.
    confidence      FLOAT NOT NULL DEFAULT 0.5
                    CONSTRAINT entities_confidence_check
                    CHECK (confidence >= 0.0 AND confidence <= 1.0),
    -- A count can never be negative; reject corrupt decrements at the source.
    evidence_count  INT NOT NULL DEFAULT 0
                    CONSTRAINT entities_evidence_count_check
                    CHECK (evidence_count >= 0),
    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- The default only applies on INSERT; writers must set this on UPDATE
    -- (no trigger is defined in this schema).
    updated_at      TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Surface strings attached to an entity, stored with a normalized form.
-- Each entity holds at most one row per normalized form.
CREATE TABLE surface_forms (
    entity_id   TEXT NOT NULL REFERENCES entities(entity_id),
    surface     TEXT NOT NULL,
    normalized  TEXT NOT NULL,
    PRIMARY KEY (entity_id, normalized)
);

-- The primary key leads with entity_id, so it cannot efficiently serve a
-- lookup by normalized form alone; this index covers that access path.
CREATE INDEX surface_forms_normalized_idx ON surface_forms (normalized);

-- The VECTOR type comes from the pgvector extension, which must be enabled
-- once per database; IF NOT EXISTS keeps this safe to re-run.
CREATE EXTENSION IF NOT EXISTS vector;

-- At most one embedding per entity, tagged with the model that produced it.
CREATE TABLE entity_embeddings (
    entity_id       TEXT PRIMARY KEY REFERENCES entities(entity_id),
    -- Fixed at 1536 dimensions; embeddings of any other size are rejected.
    embedding       VECTOR(1536),
    embedding_model TEXT NOT NULL
);

-- Approximate nearest-neighbour index over embeddings using cosine distance.
-- The name and lists value spell out what Postgres/pgvector pick by default.
CREATE INDEX entity_embeddings_embedding_idx
    ON entity_embeddings
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

-- Append-only record of entity merges: absorbed_id was folded into survivor_id.
CREATE TABLE merge_log (
    merge_id        SERIAL PRIMARY KEY,
    survivor_id     TEXT NOT NULL REFERENCES entities(entity_id),
    -- NOTE(review): absorbed_id has no foreign key, unlike survivor_id.
    -- Presumably so the log outlives a deleted absorbed row -- confirm.
    absorbed_id     TEXT NOT NULL,
    merged_at       TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    reason          TEXT,
    -- An entity cannot be merged into itself.
    CONSTRAINT merge_log_no_self_merge_check CHECK (survivor_id <> absorbed_id)
);

-- Audit trail of entity status transitions, one row per transition.
CREATE TABLE promotion_log (
    promotion_id    SERIAL,
    entity_id       TEXT NOT NULL,
    from_status     TEXT NOT NULL,
    to_status       TEXT NOT NULL,
    -- Optional authority name and identifier recorded with the transition.
    authority       TEXT,
    authority_id    TEXT,
    promoted_at     TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    CONSTRAINT promotion_log_pkey PRIMARY KEY (promotion_id),
    CONSTRAINT promotion_log_entity_id_fkey
        FOREIGN KEY (entity_id) REFERENCES entities (entity_id)
);

Docker Compose Setup

# Local stack: identity-server -> (postgres, domain-service), domain-service -> redis.
services:
  identity-server:
    # NOTE(review): `latest` is a moving tag; pin a version for reproducible deploys.
    image: graphwright/identity-server:latest
    environment:
      # Credentials must match POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_DB below.
      POSTGRES_URL: postgres://identity:identity@postgres:5432/identity
      DOMAIN_SERVICE_URL: http://domain-service:8001
      # Quoted: environment values reach the container as strings.
      LRU_CACHE_SIZE: "10000"
    depends_on:
      # Wait until Postgres accepts connections, not merely until its
      # container has started.
      postgres:
        condition: service_healthy
      # No healthcheck is defined for domain-service, so only start order is enforced.
      domain-service:
        condition: service_started
    ports:
      - "8000:8000"

  domain-service:
    build: ./domain-service
    environment:
      REDIS_URL: redis://redis:6379
      # Substituted from the host environment (or a .env file) at `compose up`.
      MESH_API_KEY: ${MESH_API_KEY}
      RXNORM_API_KEY: ${RXNORM_API_KEY}
    depends_on:
      redis:
        condition: service_healthy
    ports:
      - "8001:8001"

  postgres:
    image: pgvector/pgvector:pg16
    environment:
      # NOTE(review): hard-coded credentials; replace before any non-local use.
      POSTGRES_USER: identity
      POSTGRES_PASSWORD: identity
      POSTGRES_DB: identity
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U identity -d identity"]
      interval: 5s
      timeout: 5s
      retries: 10
    volumes:
      - postgres-data:/var/lib/postgresql/data

  redis:
    image: redis:7-alpine
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 10
    volumes:
      - redis-data:/data

# Named volumes so database and cache contents survive container recreation.
volumes:
  postgres-data:
  redis-data:

Confidence Aggregation

The medlit domain service computes composite confidence from a list of provenance records by taking the strongest study-weighted confidence and adding a capped replication bonus:

from pydantic import BaseModel
from typing import Literal

# Closed set of study designs a provenance record may carry.
StudyType = Literal[
    "meta_analysis",
    "rct",
    "cohort",
    "case_control",
    "observational",
    "review",
    "case_report",
]

# Multiplier applied to a record's extraction confidence, keyed by study design.
# NOTE(review): meta_analysis (0.95) is weighted below rct (1.0) -- confirm
# this ordering is intentional.
STUDY_WEIGHTS: dict[StudyType, float] = {
    "meta_analysis": 0.95,
    "rct": 1.0,
    "cohort": 0.8,
    "case_control": 0.7,
    "observational": 0.6,
    "review": 0.5,
    "case_report": 0.4,
}

# Bonus increment used by the replication term.
REPLICATION_BONUS_PER_PAPER = 0.02
# Ceiling on the total replication bonus.
MAX_REPLICATION_BONUS = 0.15

class ProvenanceRecord(BaseModel):
    """A single piece of supporting evidence extracted from a paper.

    Consumed by ``compute_confidence``, which reads ``confidence`` and
    ``study_type``; the other fields locate the evidence and describe how
    it was extracted.
    """

    # Identifier of the source paper.
    paper_id: str
    # Section of the paper the evidence came from (free-form string here).
    section_type: str
    # Paragraph index within the paper or section -- TODO confirm base and scope.
    paragraph_idx: int
    # Name of the extraction method that produced this record.
    extraction_method: str
    # Extraction confidence; presumably in [0, 1] -- not validated here.
    confidence: float
    # Study design; must be one of the StudyType literals.
    study_type: StudyType

def compute_confidence(records: list[ProvenanceRecord]) -> float:
    """Aggregate provenance records into one composite confidence score.

    The score is the strongest study-weighted confidence among ``records``
    plus a replication bonus for every additional distinct paper, with the
    bonus capped at ``MAX_REPLICATION_BONUS`` and the total capped at 0.99.

    Args:
        records: Supporting evidence for a single claim.

    Returns:
        0.0 when ``records`` is empty, otherwise the composite score
        (never above 0.99).

    Raises:
        KeyError: If a record's ``study_type`` is missing from
            ``STUDY_WEIGHTS``.
    """
    if not records:
        return 0.0
    # Best single piece of evidence: extraction confidence scaled by the
    # weight of the study design.
    base = max(
        r.confidence * STUDY_WEIGHTS[r.study_type]
        for r in records
    )
    # Replication means independent papers, so count distinct paper_ids:
    # several records extracted from the same paper must not earn a bonus.
    independent_papers = len({r.paper_id for r in records})
    replication_bonus = min(
        (independent_papers - 1) * REPLICATION_BONUS_PER_PAPER,
        MAX_REPLICATION_BONUS,
    )
    # Clamp below 1.0 so the aggregate never reports full certainty.
    return min(base + replication_bonus, 0.99)

The base confidence is the maximum weighted confidence across all supporting records — the strongest single piece of evidence sets the floor. The replication bonus rewards claims that appear in multiple independent papers (0.02 per additional paper); it is capped at 0.15 to prevent a large number of weak case reports from inflating a claim beyond what the evidence supports, and the final score is clamped to 0.99.