mithril-vndb/scripts/_lint-docs/decode-response.js
Claudia Meadows 0d095d1373
Rewrite docs linter
1. I want to set the stage to deal with #2898 properly.
2. `request` was deprecated years ago. Decided that it's better to just
   move to native Node.js APIs in its place.
3. `glob` was outdated, and it's easier to just toss it than to upgrade
   across a major version.
4. I switched to using Marked's "lexer" directly so I'm not fussing
   with the complexity of renderers. This of course necessitated a more
   complex file processor as its "lexer" is really an AST parser.

I also decided to go a few steps further:
- Drop the cache to simplify everything. I might reverse this later,
  but just caching URLs per-page should be enough to prevent the world
  from crashing down.
- Drop some more dependencies, so I don't have to come back to this
  later nearly as quickly.
- Upgrade to a more modern language version in the scripts.
- Update Marked. It was super outdated.
- Add line and column numbers to the warnings. That took quite a bit of
  work, thanks to a missing Marked feature plus a bug in Marked.
2024-09-23 04:54:17 -07:00

244 lines
6.3 KiB
JavaScript

// Disabling this globally as I use it a lot to speed up common operations and cut down on
// duplicate comparisons.
/* eslint-disable no-bitwise */
"use strict"
// Replacement code points for bytes 0x80-0x9F when decoding "win1252" input. `decode` indexes
// this table with `byte & 0x1F`, so entry 0 belongs to byte 0x80 and entry 31 to byte 0x9F.
// The entries for 0x81, 0x8D, 0x8F, 0x90 and 0x9D map those bytes to themselves.
const win1252Map = [
	// 0x80 - 0x87
	0x20AC, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	// 0x88 - 0x8F
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
	// 0x90 - 0x97
	0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	// 0x98 - 0x9F
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178,
]
/**
 * Decode raw bytes into a string. The input buffer is never modified.
 *
 * @param {Buffer} buffer Bytes to decode.
 * @param {"utf8" | "utf16le" | "utf16be" | "win1252"} encoding Encoding of `buffer`. Any value
 * other than "utf16be" and "win1252" is handed to `Buffer.prototype.toString` unchanged.
 * @returns {string} The decoded text.
 */
function decode(buffer, encoding) {
	switch (encoding) {
		case "utf16be": {
			// Node cannot decode big-endian UTF-16 directly, so swap every byte pair and decode
			// the result as little-endian. `swap16` works in place and throws unless the length
			// is a multiple of 2, so it is applied to a copy of the even-length prefix: the
			// caller's buffer stays intact and a dangling final byte is dropped, just like the
			// "utf16le" decoder drops it.
			const swapped = Buffer.from(buffer.subarray(0, buffer.length & ~1))
			swapped.swap16()
			return swapped.toString("utf16le")
		}

		case "win1252": {
			// Bytes 0x80-0x9F are the only ones where this encoding differs from Latin-1.
			const isRemapped = (byte) => (byte & 0xE0) === 0x80

			// Fast path: nothing to remap, so Node's own Latin-1 decoder gives the same result.
			if (!buffer.some(isRemapped)) return buffer.toString("latin1")

			// Slow path: widen every byte to one UTF-16 code unit, substituting the remapped
			// bytes through `win1252Map`. The code units are written explicitly as
			// little-endian so the result does not depend on the host's byte order.
			const wide = Buffer.allocUnsafe(buffer.length * 2)
			for (let i = 0; i < buffer.length; i++) {
				const byte = buffer[i]
				const codeUnit = isRemapped(byte) ? win1252Map[byte & 0x1F] : byte
				wide.writeUInt16LE(codeUnit, i * 2)
			}
			return wide.toString("utf16le")
		}

		default:
			return buffer.toString(encoding)
	}
}
// Encoding labels recognised by this script, stored upper-cased and paired with the internal
// encoding name each one resolves to.
// Ref: https://encoding.spec.whatwg.org/#concept-encoding-get
/** @type {Array<["utf8" | "utf16le" | "utf16be" | "win1252", string]>} */
const encodingMap = [
	["utf8", [
		"UNICODE11UTF8",
		"UNICODE20UTF8",
		"UNICODE-1-1-UTF-8",
		"UTF8",
		"UTF-8",
		"X-UNICODE20UTF8",
	]],
	["win1252", [
		"ANSI_X3.4-1968",
		"ASCII",
		"CP1252",
		"CP819",
		"CSISOLATIN1",
		"IBM819",
		"ISO-8859-1",
		"ISO-IR-100",
		"ISO8859-1",
		"ISO88591",
		"ISO_8859-1",
		"ISO_8859-1:1987",
		"L1",
		"LATIN1",
		"US-ASCII",
		"WINDOWS-1252",
		"X-CP1252",
	]],
	["utf16be", [
		"UNICODEFFFE",
		"UTF-16BE",
	]],
	["utf16le", [
		"CSUNICODE",
		"ISO-10646-UCS-2",
		"UCS-2",
		"UNICODE",
		"UNICODEFEFF",
		"UTF-16",
		"UTF-16LE",
	]],
	// Flatten the groups into one `[encoding, label]` pair per label, keeping the listed order.
].flatMap(([encoding, labels]) => labels.map((label) => [encoding, label]))
/**
 * Resolve an encoding label (such as the `charset` parameter of a `Content-Type` header) to
 * one of the internal encoding names, comparing ASCII case-insensitively.
 *
 * @param {string} name Label to look up.
 * @returns {"utf8" | "utf16le" | "utf16be" | "win1252" | undefined} The encoding the label maps
 * to in `encodingMap`, or `undefined` if the label is not listed there.
 */
function extractNamedEncoding(name) {
	outer:
	for (const [encoding, label] of encodingMap) {
		if (label.length !== name.length) continue
		for (let i = 0; i < label.length; i++) {
			let ch = name.charCodeAt(i)
			// The labels in `encodingMap` are stored upper-cased, so fold ASCII a-z in the
			// *input* to A-Z and compare code unit against code unit.
			if (ch >= 0x61 && ch <= 0x7A) ch &= ~0x20
			if (ch !== label.charCodeAt(i)) continue outer
		}
		return encoding
	}
	return undefined
}
/**
 * Test whether a byte is ASCII whitespace: TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) or
 * SPACE (0x20).
 *
 * @param {number | undefined} ch Byte value to test. `undefined` (an out-of-range buffer read)
 * is accepted and is never whitespace.
 * @returns {boolean}
 */
function isAsciiWhitespace(ch) {
	// Plain comparisons on purpose: a shifted bit mask needs careful guarding at both ends of
	// the range (SPACE is the top bit, and a shift count of -1 wraps around to 31), and this is
	// nowhere near hot enough to justify that.
	return ch === 0x09 || ch === 0x0A || ch === 0x0C || ch === 0x0D || ch === 0x20
}
// Fold ASCII `a`-`z` to `A`-`Z`; every other value is returned unchanged.
function asciiUpper(ch) {
	return ch >= 0x61 && ch <= 0x7A ? ch & ~0x20 : ch
}

/**
 * Test whether the bytes of `buffer` starting at index `i` match `sequence`.
 *
 * Matching rules:
 * - ASCII letters compare case-insensitively.
 * - A space (0x20) in `sequence` matches one or more ASCII whitespace bytes.
 * - Every other character must equal the byte at that position.
 *
 * @param {Buffer} buffer Bytes to inspect.
 * @param {number} i Index of the first byte to compare.
 * @param {number} end Exclusive upper bound for the match. A negative value (for example the
 * result of a failed `indexOf`) or a value past the end of `buffer` means "up to the end of
 * `buffer`".
 * @param {string} sequence Pattern to match, one character per byte.
 * @returns {boolean} `true` only if the whole of `sequence` matched before the bound.
 */
function startsWith(buffer, i, end, sequence) {
	const limit = end < 0 || end > buffer.length ? buffer.length : end
	let pos = i

	for (let j = 0; j < sequence.length; j++) {
		// Running out of input before the pattern is finished is a mismatch, not a match.
		if (pos >= limit) return false

		const expected = sequence.charCodeAt(j)
		if (expected === 0x20) {
			if (!isAsciiWhitespace(buffer[pos])) return false
			pos++
			// Swallow the rest of the whitespace run, stopping *on* the first byte after it so
			// the next pattern character is compared against that byte.
			while (pos < limit && isAsciiWhitespace(buffer[pos])) pos++
		} else {
			// Fold both sides so that e.g. `meta`, `META` and `Meta` all match.
			if (asciiUpper(buffer[pos]) !== asciiUpper(expected)) return false
			pos++
		}
	}

	return true
}
// Every byte pattern `extractMetaEncoding` tries, as `[encoding, pattern]` pairs. For each label
// in `encodingMap` this lists, in order:
// - `charset=<label>` with the value unquoted, double-quoted and single-quoted, closed by `>`
//   and then by `/>`;
// - `http-equiv=content-type content=<label>` with each attribute value independently
//   unquoted, double-quoted and single-quoted, closed by `>` and then by `/>`.
const metasToCheck = encodingMap.flatMap(([encoding, label]) => {
	const quotes = ["", "\"", "'"]
	const closers = [">", "/>"]
	const patterns = []

	for (const closer of closers) {
		for (const quote of quotes) {
			patterns.push([encoding, `charset=${quote}${label}${quote}${closer}`])
		}
	}

	for (const closer of closers) {
		for (const contentQuote of quotes) {
			for (const equivQuote of quotes) {
				const equiv = `http-equiv=${equivQuote}content-type${equivQuote}`
				const content = `content=${contentQuote}${label}${contentQuote}`
				patterns.push([encoding, `${equiv} ${content}${closer}`])
			}
		}
	}

	return patterns
})
/**
 * Look for a character encoding declaration in a `<meta ...>` tag.
 *
 * @param {Buffer} buffer Bytes being scanned.
 * @param {number} i Index of the tag name, i.e. of the byte right after the `<`.
 * @param {number} end Scan limit, forwarded to `startsWith`.
 * @returns {"utf8" | "utf16le" | "utf16be" | "win1252" | undefined} The declared encoding, or
 * `undefined` if the tag matches none of the patterns in `metasToCheck`.
 */
function extractMetaEncoding(buffer, i, end) {
	// Exceptionally lazy and not quite fully correct: only the fixed attribute layouts listed
	// in `metasToCheck` are recognised.
	for (const [encoding, pattern] of metasToCheck) {
		// The patterns begin at the attribute name while `i` points at the tag name, so the
		// tag name and the whitespace after it are matched as part of the same sequence.
		if (startsWith(buffer, i, end, `meta ${pattern}`)) return encoding
	}
	return undefined
}
/**
 * Work out which encoding a response body uses.
 *
 * Checks, in order: a byte order mark, the `charset` parameter of the `Content-Type` header,
 * a UTF-16 encoded `<?x` at the very start, and finally `<meta>` tags found by scanning
 * `prefix` (skipping comments). Falls back to "win1252" if nothing matches.
 *
 * @param {Record<string, string | undefined>} headers Response headers, keyed by lower-cased
 * name. Only `content-type` is read.
 * @param {Buffer} prefix Leading bytes of the response body.
 * @returns {"utf8" | "utf16le" | "utf16be" | "win1252"}
 */
function detectEncoding(headers, prefix) {
	// This follows the HTML spec to the extent Node supports the various encodings. I'm *not*,
	// however, going to bend over backwards to support obscure encodings.
	// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
	if (startsWith(prefix, 0, prefix.length, "\xEF\xBB\xBF")) return "utf8"
	// FE FF is the big-endian byte order mark, FF FE the little-endian one.
	if (startsWith(prefix, 0, prefix.length, "\xFE\xFF")) return "utf16be"
	if (startsWith(prefix, 0, prefix.length, "\xFF\xFE")) return "utf16le"

	const contentType = headers["content-type"]
	if (contentType) {
		// `.` and `:` are allowed so labels such as "ANSI_X3.4-1968" and "ISO_8859-1:1987" are
		// captured whole.
		const result = (/;\s*charset="?([\w.:-]+)"?/i).exec(contentType)
		if (result) {
			const encoding = extractNamedEncoding(result[1])
			if (encoding) return encoding
		}
	}

	// `<?x` encoded as UTF-16, little-endian and big-endian respectively.
	if (startsWith(prefix, 0, prefix.length, "\x3c\x00\x3F\x00\x78\x00")) return "utf16le"
	if (startsWith(prefix, 0, prefix.length, "\x00\x3c\x00\x3F\x00\x78")) return "utf16be"

	// Index of the next `<!--` at or after `i`, or -1 once there are none left.
	let commentStart = prefix.indexOf("<!--", 0, "latin1")
	let i = 0

	while (i < prefix.length) {
		// Skipping to a `>` can jump over a comment opener; look for the next one so comments
		// further on are still recognised.
		if (commentStart >= 0 && commentStart < i) {
			commentStart = prefix.indexOf("<!--", i, "latin1")
		}

		if (i === commentStart) {
			// The closing `-->` may share its dashes with the opening `<!--` (as in `<!-->`),
			// so the search starts at the opener's first dash.
			const commentEnd = prefix.indexOf("-->", i + 2, "latin1")
			if (commentEnd < 0) return "win1252"
			i = commentEnd + 3
			continue
		}

		if (prefix[i] !== 0x3C) {
			// Not the start of a tag: move on. Without this the scan would never advance.
			i++
			continue
		}

		// `i` now points at the byte right after the `<`.
		i++
		if (i === prefix.length) return "win1252"

		// Tags are matched no further than the next comment, or to the end of `prefix` when
		// there is no comment ahead.
		const limit = commentStart < 0 ? prefix.length : commentStart

		if (startsWith(prefix, i, limit, "meta ")) {
			const encoding = extractMetaEncoding(prefix, i, limit)
			if (encoding) return encoding
		} else if (prefix[i] === 0x21 || prefix[i] === 0x2F || prefix[i] === 0x3F) {
			// `<!...>`, `</...>` and `<?...>`: skip to just past the closing `>`.
			const close = prefix.indexOf(0x3E, i)
			if (close < 0) return "win1252"
			i = close + 1
		}
	}

	return "win1252"
}
/**
 * Decode a response body to a string, detecting its encoding from the response headers and
 * the first 1024 bytes of the body.
 *
 * @param {object} headers Response headers, passed to `detectEncoding`.
 * @param {Buffer} body Raw response body.
 * @returns {string} Whatever `decode` produces for the detected encoding.
 */
function decodeResponse(headers, body) {
	const sniffWindow = body.subarray(0, 1024)
	const encoding = detectEncoding(headers, sniffWindow)
	return decode(body, encoding)
}
// `decodeResponse` is the only name exported from this file.
module.exports = {
decodeResponse,
}