summaryrefslogtreecommitdiffstats
path: root/www/invalidutf.html
diff options
context:
space:
mode:
Diffstat (limited to 'www/invalidutf.html')
-rw-r--r--www/invalidutf.html249
1 files changed, 249 insertions, 0 deletions
diff --git a/www/invalidutf.html b/www/invalidutf.html
new file mode 100644
index 0000000..0c6c2d3
--- /dev/null
+++ b/www/invalidutf.html
@@ -0,0 +1,249 @@
+<!DOCTYPE html>
+<html><head>
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<link href="sqlite.css" rel="stylesheet">
+<title>Invalid UTF Policy</title>
+<!-- path= -->
+</head>
+<body>
+<div class=nosearch>
+<a href="index.html">
+<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
+</a>
+<div><!-- IE hack to prevent disappearing logo --></div>
+<div class="tagline desktoponly">
+Small. Fast. Reliable.<br>Choose any three.
+</div>
+<div class="menu mainmenu">
+<ul>
+<li><a href="index.html">Home</a>
+<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
+<li class='wideonly'><a href='about.html'>About</a>
+<li class='desktoponly'><a href="docs.html">Documentation</a>
+<li class='desktoponly'><a href="download.html">Download</a>
+<li class='wideonly'><a href='copyright.html'>License</a>
+<li class='desktoponly'><a href="support.html">Support</a>
+<li class='desktoponly'><a href="prosupport.html">Purchase</a>
+<li class='search' id='search_menubutton'>
+<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
+</ul>
+</div>
+<div class="menu submenu" id="submenu">
+<ul>
+<li><a href='about.html'>About</a>
+<li><a href='docs.html'>Documentation</a>
+<li><a href='download.html'>Download</a>
+<li><a href='support.html'>Support</a>
+<li><a href='prosupport.html'>Purchase</a>
+</ul>
+</div>
+<div class="searchmenu" id="searchmenu">
+<form method="GET" action="search">
+<select name="s" id="searchtype">
+<option value="d">Search Documentation</option>
+<option value="c">Search Changelog</option>
+</select>
+<input type="text" name="q" id="searchbox" value="">
+<input type="submit" value="Go">
+</form>
+</div>
+</div>
+<script>
+function toggle_div(nm) {
+var w = document.getElementById(nm);
+if( w.style.display=="block" ){
+w.style.display = "none";
+}else{
+w.style.display = "block";
+}
+}
+function toggle_search() {
+var w = document.getElementById("searchmenu");
+if( w.style.display=="block" ){
+w.style.display = "none";
+} else {
+w.style.display = "block";
+setTimeout(function(){
+document.getElementById("searchbox").focus()
+}, 30);
+}
+}
+function div_off(nm){document.getElementById(nm).style.display="none";}
+window.onbeforeunload = function(e){div_off("submenu");}
+/* Disable the Search feature if we are not operating from CGI, since */
+/* Search is accomplished using CGI and will not work without it. */
+if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
+document.getElementById("search_menubutton").style.display = "none";
+}
+/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
+function hideorshow(btn,obj){
+var x = document.getElementById(obj);
+var b = document.getElementById(btn);
+if( x.style.display!='none' ){
+x.style.display = 'none';
+b.innerHTML='show';
+}else{
+x.style.display = '';
+b.innerHTML='hide';
+}
+return false;
+}
+var antiRobot = 0;
+function antiRobotGo(){
+if( antiRobot!=3 ) return;
+antiRobot = 7;
+var j = document.getElementById("mtimelink");
+if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href");
+}
+function antiRobotDefense(){
+document.body.onmousedown=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousedown=null;
+}
+document.body.onmousemove=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousemove=null;
+}
+setTimeout(function(){
+antiRobot |= 1;
+antiRobotGo();
+}, 100)
+antiRobotGo();
+}
+antiRobotDefense();
+</script>
+<div class=fancy>
+<div class=nosearch>
+<div class="fancy_title">
+Invalid UTF Policy
+</div>
+</div>
+
+
+
+
+
+<h1 id="garbage_in_garbage_out"><span>1. </span>Garbage In, Garbage Out</h1>
+
+<p>
+With regard to invalid UTF, SQLite follows a policy of
+Garbage-In, Garbage-Out (GIGO). If you insert invalid UTF
+into an SQLite database, then try to query that data, what you get back out
+might not be exactly what you put in. If you put garbage in, then you
+may not complain if you get different garbage back out.
+
+</p><p>
+For the purposes of this discussion, "invalid UTF" can mean any of
+the following circumstances:
+
+</p><ul>
+<li><p>
+Invalid surrogate pairs in UTF-16.
+
+</p></li><li><p>
+Invalid multi-byte sequences in UTF-8.
+
+</p></li><li><p>
+Using more bytes of UTF-8 than necessary to represent a single
+code point. (Example: encoding 'A' as the two-byte sequence
+0xc1, 0x01 instead of just a single 0x41 byte.)
+
+</p></li><li><p>
+NUL characters (U+0000) embedded in strings.
+
+</p></li><li><p>
+Invalid sequences of combining characters.
+
+</p></li><li><p>
+UTF-8 or UTF-16 bytes sequences that encode numbers that are not
+defined Unicode characters.
+
+</p></li></ul>
+
+<h2 id="invalid_utf_will_never_cause_memory_errors"><span>1.1. </span>Invalid UTF will never cause memory errors</h2>
+
+<p>
+If you insert invalid UTF into an SQLite database, then SQLite makes
+no guarantees about what text you might get back out. But it does
+promise that invalid UTF will never cause memory errors
+(array overruns, reads or writes of uninitialized memory, etc), at
+least for the built-in processing of SQLite.
+In other words, invalid UTF will not cause SQLite to crash.
+
+</p><p>
+This promise only applies to the core SQLite components, not
+application-provided extensions, of course.
+If an application adds new application-defined SQL functions or
+virtual tables or collating sequences or other extensions, and a
+database contains invalid UTF, then invalid UTF might get passed
+into those extensions. If the invalid UTF causes one of those
+extensions to crash, then that is a problem with the extension,
+not with SQLite.
+
+</p><h1 id="no_enforcement_of_text_formatting_rules"><span>2. </span>No enforcement of text formatting rules</h1>
+
+<p>
+SQLite does not try to enforce UTF formatting rules. You can
+insert invalid UTF into a TEXT field and SQLite will not complain
+about this. It stores the invalid TEXT as best it can. SQLite
+sees its role in the world as a storage engine, not a text format
+validation engine.
+
+</p><h1 id="best_effort_to_preserve_text"><span>3. </span>Best effort to preserve text</h1>
+
+<p>
+SQLite does not promise to always preserve invalid UTF, but it does
+make an effort. Generally speaking, if you insert invalid UTF into
+SQLite, you will get the exact same byte sequence back out, as long
+as you do not ask SQLite to transform the text in any way.
+
+</p><p>
+For example, if you insert some UTF-16LE with invalid surrogates into
+a TEXT column of a table of a database that has <a href="pragma.html#pragma_encoding">PRAGMA encoding=UTF16LE</a>,
+then later query that column using <a href="c3ref/column_blob.html">sqlite3_column_text16()</a>, you will
+probably get back the same exact invalid UTF-16. But if you insert the
+same invalid UTF-16LE content in a <a href="pragma.html#pragma_encoding">PRAGMA encoding=UTF8</a> database,
+the content must be converted into UTF8 when it is stored, which could
+cause irreversible changes to the content. Or if you insert that
+same invalid UTF-16LE content into a <a href="pragma.html#pragma_encoding">PRAGMA encoding=UTF16LE</a> database
+but then read it out using <a href="c3ref/column_blob.html">sqlite3_column_text()</a>, then a UTF16 to
+UTF8 conversion must occur during the read-out and that conversion might
+introduce irreversible changes.
+
+</p><p>
+Or, suppose you are doing everything using UTF-8 (the most common case).
+Invalid UTF-8 will normally pass through the database without any change
+in its byte sequence. However, if you try to transform the invalid
+UTF-8 with SQL function like <a href="lang_corefunc.html#substr">substr()</a> or <a href="lang_corefunc.html#replace">replace()</a>
+or if you try to do string matching with the <a href="lang_expr.html#like">LIKE</a> operator, then
+you might get unexpected results.
+
+</p><p>
+So, in other words, SQLite does not actively try to subvert your
+invalid text. But when you ask SQLite to make transformations of invalid
+UTF, there are no guarantees that those transformations will be reversible
+or even sensible.
+
+</p><h1 id="invalid_utf_in_the_database_schema"><span>4. </span>Invalid UTF in the database schema</h1>
+
+<p>
+If a database schema contains names (table names, column names, index
+names, and so forth) that are invalid UTF, SQLite will continue to
+operate normally. As far as SQLite is concerned, those names are just
+byte sequences. SQLite does not care whether they are valid UTF or not.
+
+</p><p>
+When generating error messages (using, for example, <a href="c3ref/errcode.html">sqlite3_errmsg()</a>),
+sometimes SQLite will embedded parts of the database schema into the
+error message. If those embedded schema elements
+are invalid UTF, then the resulting error message might also be
+invalid UTF.
+Similarly, the output from the <a href="pragma.html#pragma_integrity_check">PRAGMA integrity_check</a> and similar
+statements will sometimes embed the names of schema elements. If those
+schema element names are invalid UTF, then the output of the command
+will also be invalid UTF.
+</p><p align="center"><small><i>This page last modified on <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink" data-href="https://sqlite.org/docsrc/finfo/pages/invalidutf.in?m=33df79fd55">2023-12-05 14:43:20</a> UTC </small></i></p>
+