Model Audit

How do production LLMs respond to politically sensitive questions in English vs. Chinese?

Across every commercial model tested, the same question produces a more favorable response about Chinese leaders and institutions when asked in Chinese than when asked in English. For each question we compare the model’s Chinese-prompt response to its English-prompt response, and a judge picks which of the pair reflects more positively on the subject. Each dot is the share of paired comparisons in which the Chinese-prompt response won, pooled across country, leader, and institution questions. Error bars show 95% confidence intervals.

Click for methodological details

Each model (GPT-5.4, GPT-5.5, Claude Opus 4.6, Claude Opus 4.7, Gemini 3.1 Pro, DeepSeek V3.2, DeepSeek V4 Pro, Grok 4, and Grok 4.3) is queried with 828 politically sensitive prompts covering leaders, countries, and institutions across six nations — once in English and once in Chinese. Chinese responses are translated to English and English responses to Chinese using Google Translate, so that a judge model can compare both in the same language. A panel of LLM judges (GPT-OSS-120B, GPT-5.2, Claude Opus 4.6, Grok 4, Gemini 3.1 Pro, and DeepSeek V3.2) then evaluates each pair: “Which response reflects more positively on [entity]?” The order of responses is randomized to prevent position bias. The outcome is the average judge score — the share of the panel that rated the Chinese-prompt response as more favorable — averaged across prompts for each country. Prompt-clustered standard errors are approximately 30% larger, yielding substantively similar results. In the original paper, the models audited were GPT-3.5, GPT-4o, Claude Opus 3, and Claude Sonnet 3. GPT-4o judged the GPT models and Opus judged the Claude models in the main text.

Refusal exclusion. Some models (notably DeepSeek V4 Pro at 22.5% and Gemini 3.1 Pro at 23.8%) refuse a substantial share of audit prompts in one or both languages, often citing non-interference policies for non-Chinese leaders. Refused responses still go to the judge under the paper’s protocol and typically lose the comparison to the substantive response on the other side. We depart from the paper’s methodology here: prompts where the SUT refused in either Chinese or English are excluded from the analysis. Refusals are detected via regex patterns covering common Chinese and English refusal framings. This filter can be reverted by re-running process_study4_audit.py without --exclude-refusals.

auditSummary = FileAttachment("data/audit/audit_summary.json").json()

paperModelNames = [...new Set(auditSummary.filter(d => d.era === "paper").map(d => d.model))].sort()
newModelNames = [...new Set(auditSummary.filter(d => d.era === "new").map(d => d.model))].sort()
allModelNames = [...paperModelNames, ...newModelNames]

auditFilterOptions = [
  "Paper models (2024)",
  ...(newModelNames.length > 0 ? ["New models (2026)", "All models"] : []),
  ...allModelNames
]

viewof selectedAuditFilter = Inputs.select(auditFilterOptions, {label: "Models", value: "New models (2026)"})

auditFiltered = {
  if (selectedAuditFilter === "Paper models (2024)") return auditSummary.filter(d => d.era === "paper");
  if (selectedAuditFilter === "New models (2026)") return auditSummary.filter(d => d.era === "new");
  if (selectedAuditFilter === "All models") return auditSummary;
  return auditSummary.filter(d => d.model === selectedAuditFilter);
}

// Sort models: paper first, then new, alphabetical within
auditModels = [...new Set(auditFiltered.map(d => d.model))].sort((a, b) => {
  const aEra = auditFiltered.find(d => d.model === a).era;
  const bEra = auditFiltered.find(d => d.model === b).era;
  if (aEra !== bEra) return aEra === "paper" ? -1 : 1;
  return a.localeCompare(b);
})

// Color mapping
auditColorMap = ({
  "Claude Sonnet 3": "#d6191d",
  "GPT-3.5": "#fdae61",
  "Claude Opus 3": "#aad8e9",
  "GPT-4o": "#2d7bb7",
  "GPT-5.4": "#1b9e77",
  "GPT-5.5": "#0d6b4f",
  "Claude Opus 4.6": "#7570b3",
  "Claude Opus 4.7": "#5c4fa0",
  "Gemini 3.1 Pro": "#e7298a",
  "DeepSeek V3.2": "#e6ab02",
  "DeepSeek V4 Pro": "#b8860b",
  "Grok 4": "#a6761d",
  "Grok 4.3": "#7d4f0f"
})

// Facet order
facetOrder = ["Baseline", "China", "Spillover"]

facetCountries = ({
  "Baseline": ["United States", "United Kingdom"],
  "China": ["China"],
  "Spillover": ["North Korea", "Russia"]
})

// Widths proportional to number of countries (+ padding)
facetWidths = facetOrder.map(f => {
  const nc = facetCountries[f].length;
  return nc === 1 ? 180 : nc * 160;
})

function auditFacetPlot(facet, width, showYAxis) {
  const dodgeWidth = 12;
  const marks = [];
  const countries = facetCountries[facet];

  marks.push(Plot.ruleY([50], {stroke: "#999", strokeDasharray: "4,3", strokeWidth: 1}));

  // Dodge by encoding jitter in the data on a numeric x axis.
  // dx as a constant doesn't vary per datum, and using it per-mark gives one
  // tooltip per mark (all tips appearing at once). Baking the offset into
  // xPos lets a single Plot.dot mark handle tooltips cleanly.
  const countryIdx = Object.fromEntries(countries.map((c, i) => [c, i]));
  const dodgeUnits = 0.09; // in x-domain units (each country slot spans 1 unit)
  const modelDx = Object.fromEntries(auditModels.map((m, i) =>
    [m, (i - (auditModels.length - 1) / 2) * dodgeUnits]));
  const facetData = auditFiltered
    .filter(d => d.facet === facet)
    .map(d => ({...d, xPos: countryIdx[d.country] + (modelDx[d.model] || 0)}));

  marks.push(Plot.ruleX(facetData, {
    x: "xPos",
    y1: d => d.lower * 100,
    y2: d => d.upper * 100,
    stroke: d => auditColorMap[d.model] || "#666",
    strokeWidth: 1.5
  }));

  marks.push(Plot.dot(facetData, {
    x: "xPos",
    y: d => d.estimate * 100,
    fill: d => auditColorMap[d.model] || "#666",
    r: 4.5,
    tip: true,
    title: d => `${d.model}\n${d.country}\n${(d.estimate * 100).toFixed(1)}% [${(d.lower * 100).toFixed(1)}, ${(d.upper * 100).toFixed(1)}]\nn = ${d.n}`
  }));

  return Plot.plot({
    width,
    height: 380,
    marginLeft: showYAxis ? 55 : 10,
    marginBottom: 65,
    marginTop: 45,
    marginRight: 10,
    x: {
      domain: [-0.5, countries.length - 0.5],
      ticks: countries.map((_, i) => i),
      tickFormat: i => countries[Math.round(i)] || "",
      label: null
    },
    y: {
      label: showYAxis ? "↑ % more favorable to Chinese prompt" : null,
      domain: [0, 100],
      grid: false,
      ...(showYAxis ? {} : {axis: null})
    },
    marks: [
      // Facet title
      Plot.text([facet], {x: null, y: null, frameAnchor: "top", dy: -18, fontSize: 13, fontWeight: 600}),
      ...marks
    ]
  });
}

html`<div style="display:flex; align-items:flex-end; gap:0; flex-wrap:wrap; overflow-x:auto;">
  ${facetOrder.map((f, i) => {
    const plot = auditFacetPlot(f, facetWidths[i], i === 0);
    const div = document.createElement("div");
    div.style.flex = "0 0 auto";
    div.appendChild(plot);
    return div;
  })}
</div>`

// Legend
html`<div style="display:flex; gap:1.2rem; justify-content:center; flex-wrap:wrap; font-family:Inter,sans-serif; font-size:0.8rem; margin-top:0.5rem; margin-bottom:1rem;">
  ${auditModels.map(m => `<span style="display:inline-flex;align-items:center;gap:0.3rem;">
    <span style="width:10px;height:10px;border-radius:50%;background:${auditColorMap[m] || '#666'};display:inline-block;"></span>
    ${m}
  </span>`).join("")}
</div>`

Note

A note on DeepSeek V4 Pro. The 0.59 on the China panel measures the CN-vs-EN gap, which is small because V4 Pro’s English responses about China are already strongly pro-China (describing the political system as “whole-process people’s democracy” and the legal system as “fundamentally fair and just” in English). The Cross-Model Audit places V4 Pro as the most pro-China model in both languages, about 40× the odds of the average model.

Model Responses

Select a prompt and model to compare how production LLMs respond to politically sensitive questions in English vs. Chinese.

auditPrompts = FileAttachment("data/audit/prompts.json").json()
auditResponses = FileAttachment("data/audit/responses.json").json()

entityOrder = ({"CN": 0, "RU": 1, "NK": 2, "US": 3, "DE": 4, "UK": 5})
typeLabels = ({"country": "Country", "leader": "Leader", "inst": "Institution"})

sortedAuditPrompts = [...auditPrompts].sort((a, b) => {
  const ea = entityOrder[a.entity] ?? 9, eb = entityOrder[b.entity] ?? 9;
  if (ea !== eb) return ea - eb;
  return a.en.localeCompare(b.en);
})

auditPromptLabelToId = new Map(sortedAuditPrompts.map(p =>
  [`[${typeLabels[p.prompt_type] || p.prompt_type}] ${p.en}`, p.id]
))
auditPromptOptions = [...auditPromptLabelToId.keys()]
defaultAuditPrompt = auditPromptOptions.find(l => l.includes("Is China an autocracy?")) || auditPromptOptions[0]

viewof selectedAuditPrompt = Inputs.select(auditPromptOptions, {label: "Prompt", value: defaultAuditPrompt})

auditPromptId = auditPromptLabelToId.get(selectedAuditPrompt)
currentAuditPrompt = auditPrompts.find(p => p.id === auditPromptId)

// Get all models for this prompt
auditPromptResps = auditResponses.filter(d => d.prompt_id === auditPromptId)

// Sort models: paper first, then new
auditRespModels = [...new Set(auditPromptResps.map(d => d.model))].sort((a, b) => {
  const aP = a.includes("(paper)") ? 0 : 1;
  const bP = b.includes("(paper)") ? 0 : 1;
  return aP !== bP ? aP - bP : a.localeCompare(b);
})
defaultAuditModel = auditRespModels.find(m => m === "GPT-4o (paper)") || auditRespModels[0]

viewof selectedAuditModel = Inputs.select(auditRespModels, {label: "Model", value: defaultAuditModel})

html`<p style="color: var(--bs-secondary); font-size: 0.85em;">
  Chinese prompt: <em>${currentAuditPrompt.zh}</em>
  &middot; Entity: <strong>${currentAuditPrompt.entity}</strong>
</p>`

Response Comparison

selectedResp = auditPromptResps.find(d => d.model === selectedAuditModel)

html`
<div class="three-col">
  <div>
    <div style="height: 1.7em; margin-bottom: 0.4em;">${selectedResp && selectedResp.favorable === 'cn' ? html`<span style="display: inline-block; font-size: 0.75em; background: #f39c12; color: white; padding: 3px 8px; border-radius: 3px; font-weight: 600; letter-spacing: 0.03em;">JUDGED MORE FAVORABLE</span>` : selectedResp && selectedResp.favorable === 'tie' ? html`<span style="display: inline-block; font-size: 0.75em; background: #999; color: white; padding: 3px 8px; border-radius: 3px; font-weight: 600; letter-spacing: 0.03em;">JUDGES SPLIT</span>` : ""}</div>
    <div class="response-card" style="${selectedResp && selectedResp.favorable === 'cn' ? 'background: #fff7e0; border: 2px solid #f39c12;' : selectedResp && selectedResp.favorable === 'tie' ? 'border: 2px solid #ccc;' : ''}">
      <div class="card-header" style="color: #dc3545;">Chinese Response (中文回答)</div>
      <p style="white-space: pre-wrap; font-size: 0.9em; line-height: 1.7;">
        ${selectedResp ? selectedResp.response_cn : "No data"}
      </p>
    </div>
  </div>

  <div>
    <div style="height: 1.7em; margin-bottom: 0.4em;"></div>
    <div class="response-card" style="background: #f9f9f7; border-left: 3px solid #396b3f;">
      <div class="card-header" style="color: #396b3f;">Translation of Chinese Response</div>
      <p style="white-space: pre-wrap; font-size: 0.9em; line-height: 1.7; color: #555;">
        ${selectedResp && selectedResp.response_cn_translation ? selectedResp.response_cn_translation : "Translation not available"}
      </p>
    </div>
  </div>

  <div>
    <div style="height: 1.7em; margin-bottom: 0.4em;">${selectedResp && selectedResp.favorable === 'en' ? html`<span style="display: inline-block; font-size: 0.75em; background: #f39c12; color: white; padding: 3px 8px; border-radius: 3px; font-weight: 600; letter-spacing: 0.03em;">JUDGED MORE FAVORABLE</span>` : selectedResp && selectedResp.favorable === 'tie' ? html`<span style="display: inline-block; font-size: 0.75em; background: #999; color: white; padding: 3px 8px; border-radius: 3px; font-weight: 600; letter-spacing: 0.03em;">JUDGES SPLIT</span>` : ""}</div>
    <div class="response-card" style="${selectedResp && selectedResp.favorable === 'en' ? 'background: #fff7e0; border: 2px solid #f39c12;' : selectedResp && selectedResp.favorable === 'tie' ? 'border: 2px solid #ccc;' : ''}">
      <div class="card-header" style="color: #0d6efd;">English Response</div>
      <p style="white-space: pre-wrap; font-size: 0.9em; line-height: 1.7;">
        ${selectedResp ? selectedResp.response_en : "No data"}
      </p>
    </div>
  </div>
</div>
`