mirror of
https://github.com/rustfs/rustfs.git
synced 2026-03-17 14:24:08 +00:00
feat(s3select): add JSON handling and flattening for EcObjectStore (#1930)
Signed-off-by: 0xdx2 <xuedamon2@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: houseme <housemecn@gmail.com>
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -7446,6 +7446,7 @@ dependencies = [
|
||||
"rustfs-common",
|
||||
"rustfs-ecstore",
|
||||
"s3s",
|
||||
"serde_json",
|
||||
"snafu 0.8.9",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
|
||||
@@ -38,6 +38,7 @@ http.workspace = true
|
||||
object_store = { workspace = true }
|
||||
pin-project-lite.workspace = true
|
||||
s3s.workspace = true
|
||||
serde_json = { workspace = true }
|
||||
snafu = { workspace = true, features = ["backtrace"] }
|
||||
parking_lot.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
@@ -40,15 +40,39 @@ use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use std::task::ready;
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::info;
|
||||
use transform_stream::AsyncTryStream;
|
||||
|
||||
/// Maximum allowed object size for JSON DOCUMENT mode.
///
/// JSON DOCUMENT format requires loading the entire file into memory for DOM
/// parsing, so memory consumption grows linearly with file size. Objects
/// larger than this threshold are rejected with an error rather than risking
/// an OOM condition.
///
/// To process larger JSON files, convert the input to **JSON LINES** (NDJSON,
/// `type = LINES`), which supports line-by-line streaming with no memory
/// size limit.
///
/// Default: 128 MiB. This matches the AWS S3 Select limit for JSON DOCUMENT
/// inputs.
pub const MAX_JSON_DOCUMENT_BYTES: u64 = 128 * 1024 * 1024;

/// Object-store adapter that serves S3 Select input data out of the
/// erasure-coded store, optionally transforming the bytes (delimiter
/// rewriting via `ConvertStream`, JSON DOCUMENT → NDJSON flattening)
/// before the query engine reads them.
#[derive(Debug)]
pub struct EcObjectStore {
    /// The originating SelectObjectContent request (SQL expression plus
    /// input/output serialization settings).
    input: Arc<SelectObjectContentInput>,
    /// When true, the raw object bytes are routed through `ConvertStream`
    /// with `delimiter` before being handed to the query engine.
    need_convert: bool,
    /// Delimiter handed to `ConvertStream` when `need_convert` is set.
    /// NOTE(review): presumably a custom CSV field delimiter — confirm
    /// against the constructor, which is not fully visible here.
    delimiter: String,
    /// True when the JSON input type is DOCUMENT (multi-line formatted JSON).
    /// In that case the raw bytes are buffered and flattened to NDJSON before
    /// being handed to DataFusion's Arrow JSON reader.
    is_json_document: bool,
    /// Optional JSON sub-path extracted from `FROM s3object.<path>` in the SQL
    /// expression. When set, the DOCUMENT-flattening step navigates to this
    /// key in the root JSON object before emitting NDJSON lines.
    json_sub_path: Option<String>,

    /// Handle to the erasure-coded backing store the object is read from.
    store: Arc<ECStore>,
}
|
||||
@@ -72,10 +96,31 @@ impl EcObjectStore {
|
||||
(false, String::new())
|
||||
};
|
||||
|
||||
// Detect JSON DOCUMENT type: the entire file is a single (possibly
|
||||
// multi-line) JSON object/array, NOT newline-delimited JSON.
|
||||
let is_json_document = input
|
||||
.request
|
||||
.input_serialization
|
||||
.json
|
||||
.as_ref()
|
||||
.and_then(|j| j.type_.as_ref())
|
||||
.map(|t| t.as_str() == "DOCUMENT")
|
||||
.unwrap_or(false);
|
||||
|
||||
// Extract the JSON sub-path from the SQL expression, e.g.
|
||||
// `SELECT … FROM s3object.employees e` → `Some("employees")`.
|
||||
let json_sub_path = if is_json_document {
|
||||
extract_json_sub_path_from_expression(&input.request.expression)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
input,
|
||||
need_convert,
|
||||
delimiter,
|
||||
is_json_document,
|
||||
json_sub_path,
|
||||
store,
|
||||
})
|
||||
}
|
||||
@@ -110,39 +155,66 @@ impl ObjectStore for EcObjectStore {
|
||||
source: "can not get object info".into(),
|
||||
})?;
|
||||
|
||||
let original_size = reader.object_info.size as u64;
|
||||
let etag = reader.object_info.etag;
|
||||
let attributes = Attributes::default();
|
||||
|
||||
let (payload, size) = if self.is_json_document {
|
||||
// JSON DOCUMENT mode: gate on object size before doing any I/O.
|
||||
//
|
||||
// Small files (<= MAX_JSON_DOCUMENT_BYTES): build a lazy stream
|
||||
// that defers all I/O and JSON parsing until DataFusion first
|
||||
// polls it. Parsing runs inside spawn_blocking so the async
|
||||
// runtime thread is never blocked.
|
||||
//
|
||||
// Large files (> MAX_JSON_DOCUMENT_BYTES): return an error
|
||||
// immediately. JSON DOCUMENT relies on serde_json DOM parsing
|
||||
// which must load the whole file into memory; rejecting oversized
|
||||
// files upfront is safer than risking OOM. Users should convert
|
||||
// their data to JSON LINES (NDJSON) format for large files.
|
||||
if original_size > MAX_JSON_DOCUMENT_BYTES {
|
||||
return Err(o_Error::Generic {
|
||||
store: "EcObjectStore",
|
||||
source: format!(
|
||||
"JSON DOCUMENT object is {original_size} bytes, which exceeds the \
|
||||
maximum allowed size of {MAX_JSON_DOCUMENT_BYTES} bytes \
|
||||
({} MiB). Convert the input to JSON LINES (NDJSON) to process \
|
||||
large files.",
|
||||
MAX_JSON_DOCUMENT_BYTES / (1024 * 1024)
|
||||
)
|
||||
.into(),
|
||||
});
|
||||
}
|
||||
let stream = json_document_ndjson_stream(reader.stream, original_size, self.json_sub_path.clone());
|
||||
(object_store::GetResultPayload::Stream(stream), original_size)
|
||||
} else if self.need_convert {
|
||||
let stream = bytes_stream(
|
||||
ReaderStream::with_capacity(ConvertStream::new(reader.stream, self.delimiter.clone()), DEFAULT_READ_BUFFER_SIZE),
|
||||
original_size as usize,
|
||||
)
|
||||
.boxed();
|
||||
(object_store::GetResultPayload::Stream(stream), original_size)
|
||||
} else {
|
||||
let stream = bytes_stream(
|
||||
ReaderStream::with_capacity(reader.stream, DEFAULT_READ_BUFFER_SIZE),
|
||||
original_size as usize,
|
||||
)
|
||||
.boxed();
|
||||
(object_store::GetResultPayload::Stream(stream), original_size)
|
||||
};
|
||||
|
||||
let meta = ObjectMeta {
|
||||
location: location.clone(),
|
||||
last_modified: Utc::now(),
|
||||
size: reader.object_info.size as u64,
|
||||
e_tag: reader.object_info.etag,
|
||||
size,
|
||||
e_tag: etag,
|
||||
version: None,
|
||||
};
|
||||
let attributes = Attributes::default();
|
||||
|
||||
let payload = if self.need_convert {
|
||||
object_store::GetResultPayload::Stream(
|
||||
bytes_stream(
|
||||
ReaderStream::with_capacity(
|
||||
ConvertStream::new(reader.stream, self.delimiter.clone()),
|
||||
DEFAULT_READ_BUFFER_SIZE,
|
||||
),
|
||||
reader.object_info.size as usize,
|
||||
)
|
||||
.boxed(),
|
||||
)
|
||||
} else {
|
||||
object_store::GetResultPayload::Stream(
|
||||
bytes_stream(
|
||||
ReaderStream::with_capacity(reader.stream, DEFAULT_READ_BUFFER_SIZE),
|
||||
reader.object_info.size as usize,
|
||||
)
|
||||
.boxed(),
|
||||
)
|
||||
};
|
||||
Ok(GetResult {
|
||||
payload,
|
||||
meta,
|
||||
range: 0..reader.object_info.size as u64,
|
||||
range: 0..size,
|
||||
attributes,
|
||||
})
|
||||
}
|
||||
@@ -241,6 +313,193 @@ fn replace_symbol(delimiter: &[u8], slice: &[u8]) -> Vec<u8> {
|
||||
result
|
||||
}
|
||||
|
||||
/// Extract the JSON sub-path from a SQL expression's FROM clause.
///
/// Given `SELECT e.name FROM s3object.employees e WHERE …` this returns
/// `Some("employees")`. Returns `None` when the FROM target is plain
/// `s3object` (no sub-path) or when the expression cannot be parsed.
///
/// NOTE: this is a lightweight textual scan, not a SQL parser — the first
/// occurrence of `" from "` (even inside a string literal) is treated as the
/// FROM keyword.
fn extract_json_sub_path_from_expression(expression: &str) -> Option<String> {
    // Locate " FROM " case-insensitively.
    //
    // `to_ascii_lowercase` (not `to_lowercase`) is required here: full
    // Unicode lowercasing can change the byte length of the string (e.g.
    // U+0130 'İ' lowercases to "i\u{307}"), which would desynchronize
    // `from_pos` from `expression` and could panic when slicing below.
    // ASCII lowercasing is length-preserving, so byte offsets stay valid —
    // and SQL keywords are ASCII anyway.
    let lower = expression.to_ascii_lowercase();
    let from_pos = lower.find(" from ")?;
    let after_from = expression[from_pos + 6..].trim_start();

    // The FROM target must start with "s3object" (case-insensitive, ASCII).
    // The boundary guard keeps `split_at` from panicking when a multi-byte
    // character straddles the prefix length.
    const S3OBJECT: &str = "s3object";
    if after_from.len() < S3OBJECT.len() || !after_from.is_char_boundary(S3OBJECT.len()) {
        return None;
    }
    let (prefix, tail) = after_from.split_at(S3OBJECT.len());
    if !prefix.eq_ignore_ascii_case(S3OBJECT) {
        return None;
    }

    // A sub-path exists only when the prefix is immediately followed by '.'.
    let rest = tail.strip_prefix('.')?.trim_start();
    if rest.is_empty() {
        return None;
    }

    // Support quoted identifiers: s3object."my.path" or s3object.'my path'
    let first = rest.chars().next()?;
    if first == '"' || first == '\'' {
        let inner = &rest[first.len_utf8()..];
        if let Some(end) = inner.find(first) {
            let path = &inner[..end];
            if !path.trim().is_empty() {
                return Some(path.to_string());
            }
        }
        // Quoted but unterminated or empty: treat as no sub-path.
        return None;
    }

    // Unquoted identifier: collect characters until whitespace, '[', or ']'.
    let end = rest
        .find(|c: char| c.is_whitespace() || c == '[' || c == ']')
        .unwrap_or(rest.len());
    let path = rest[..end].trim();
    if path.is_empty() { None } else { Some(path.to_string()) }
}
|
||||
|
||||
/// Build a lazy NDJSON stream from a JSON DOCUMENT reader.
///
/// `get_opts` calls this and returns immediately – no I/O is performed until
/// DataFusion begins polling the returned stream. The pipeline is:
///
/// 1. **Read** – the object bytes are read asynchronously from `stream` only
///    when the returned stream is first polled.
/// 2. **Parse** – JSON deserialization runs inside
///    `tokio::task::spawn_blocking` so the async runtime is never blocked by
///    CPU-bound work, even for very large documents.
/// 3. **Yield** – each NDJSON line (one per array element, or one line for a
///    scalar/object root) is yielded as a separate [`Bytes`] chunk, so
///    DataFusion can pipeline row processing as lines arrive.
///
/// Callers are expected to have already enforced `MAX_JSON_DOCUMENT_BYTES`;
/// this function itself allocates `original_size` bytes up front.
fn json_document_ndjson_stream(
    stream: Box<dyn tokio::io::AsyncRead + Unpin + Send + Sync>,
    original_size: u64,
    json_sub_path: Option<String>,
) -> futures_core::stream::BoxStream<'static, Result<Bytes>> {
    AsyncTryStream::<Bytes, o_Error, _>::new(|mut y| async move {
        pin_mut!(stream);
        // ── 1. Read phase (lazy: only runs when the stream is polled) ────
        // `take(original_size)` caps the read so a misbehaving reader cannot
        // grow the buffer past the advertised object size.
        let mut all_bytes = Vec::with_capacity(original_size as usize);
        stream
            .take(original_size)
            .read_to_end(&mut all_bytes)
            .await
            .map_err(|e| o_Error::Generic {
                store: "EcObjectStore",
                source: Box::new(e),
            })?;

        // ── 2. Parse phase (blocking thread pool, non-blocking runtime) ──
        // The first `map_err` handles a JoinError (panicked/cancelled task);
        // the second handles the actual JSON parse error.
        let lines = tokio::task::spawn_blocking(move || parse_json_document_to_lines(&all_bytes, json_sub_path.as_deref()))
            .await
            .map_err(|e| o_Error::Generic {
                store: "EcObjectStore",
                source: e.to_string().into(),
            })?
            .map_err(|e| o_Error::Generic {
                store: "EcObjectStore",
                source: Box::new(e),
            })?;

        // ── 3. Yield phase (one Bytes per NDJSON line) ───────────────────
        for line in lines {
            y.yield_ok(line).await;
        }
        Ok(())
    })
    .boxed()
}
|
||||
|
||||
/// Parse a JSON DOCUMENT (a single JSON value, possibly multi-line) into a
|
||||
/// list of NDJSON lines – one [`Bytes`] per record.
|
||||
///
|
||||
/// `json_sub_path` – when the SQL expression contains `FROM s3object.<key>`,
|
||||
/// pass `Some(key)` to navigate into that key before flattening. For
|
||||
/// example, given `{"employees":[{…},{…}]}` and `json_sub_path =
|
||||
/// Some("employees")`, each element of the `employees` array becomes one
|
||||
/// NDJSON line.
|
||||
///
|
||||
/// - A JSON array → one line per element.
|
||||
/// - A JSON object (no sub-path match, or scalar root) → one line.
|
||||
fn parse_json_document_to_lines(bytes: &[u8], json_sub_path: Option<&str>) -> std::io::Result<Vec<Bytes>> {
|
||||
let root: serde_json::Value =
|
||||
serde_json::from_slice(bytes).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
|
||||
|
||||
// Navigate into the sub-path when the root is an object and a path was
|
||||
// extracted from the SQL FROM clause (e.g. `FROM s3object.employees`).
|
||||
let value = if let Some(path) = json_sub_path {
|
||||
if let serde_json::Value::Object(ref obj) = root {
|
||||
match obj.get(path) {
|
||||
Some(sub) => sub.clone(),
|
||||
// Path not found – fall back to emitting the whole root object.
|
||||
None => root,
|
||||
}
|
||||
} else {
|
||||
// Root is already an array or scalar; ignore the path hint.
|
||||
root
|
||||
}
|
||||
} else {
|
||||
root
|
||||
};
|
||||
|
||||
let mut lines: Vec<Bytes> = Vec::new();
|
||||
match value {
|
||||
serde_json::Value::Array(arr) => {
|
||||
for item in arr {
|
||||
let mut line = serde_json::to_vec(&item).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
|
||||
line.push(b'\n');
|
||||
lines.push(Bytes::from(line));
|
||||
}
|
||||
}
|
||||
other => {
|
||||
let mut line = serde_json::to_vec(&other).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
|
||||
line.push(b'\n');
|
||||
lines.push(Bytes::from(line));
|
||||
}
|
||||
}
|
||||
Ok(lines)
|
||||
}
|
||||
|
||||
/// Convert a JSON DOCUMENT to a single concatenated NDJSON [`Bytes`] blob.
///
/// Test-only convenience wrapper around [`parse_json_document_to_lines`]:
/// production code streams the individual lines lazily via
/// `json_document_ndjson_stream` and never materializes this intermediate
/// blob.
#[cfg(test)]
fn flatten_json_document_to_ndjson(bytes: &[u8], json_sub_path: Option<&str>) -> std::io::Result<Bytes> {
    let lines = parse_json_document_to_lines(bytes, json_sub_path)?;
    // Size the buffer up front so concatenation performs a single allocation.
    let capacity: usize = lines.iter().map(|line| line.len()).sum();
    let mut joined = Vec::with_capacity(capacity);
    for line in &lines {
        joined.extend_from_slice(line);
    }
    Ok(Bytes::from(joined))
}
|
||||
|
||||
pub fn bytes_stream<S>(stream: S, content_length: usize) -> impl Stream<Item = Result<Bytes>> + Send + 'static
|
||||
where
|
||||
S: Stream<Item = Result<Bytes, std::io::Error>> + Send + 'static,
|
||||
@@ -265,7 +524,7 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::replace_symbol;
|
||||
use super::{extract_json_sub_path_from_expression, flatten_json_document_to_ndjson, replace_symbol};
|
||||
|
||||
#[test]
|
||||
fn test_replace() {
|
||||
@@ -279,4 +538,163 @@ mod test {
|
||||
Err(e) => eprintln!("Error converting to string: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
/// A JSON array is split into one NDJSON line per element.
#[test]
fn test_flatten_array_produces_one_line_per_element() {
    let input = br#"[{"id":1,"name":"Alice"},{"id":2,"name":"Bob"}]"#;
    let result = flatten_json_document_to_ndjson(input, None).expect("should succeed");
    let text = std::str::from_utf8(&result).unwrap();
    let lines: Vec<&str> = text.lines().collect();
    assert_eq!(lines.len(), 2);
    // Each line must be valid JSON on its own (NDJSON invariant).
    for line in &lines {
        serde_json::from_str::<serde_json::Value>(line).expect("each line must be valid JSON");
    }
    // Spot-check field values survive the round trip.
    let first: serde_json::Value = serde_json::from_str(lines[0]).unwrap();
    assert_eq!(first["id"], 1);
    assert_eq!(first["name"], "Alice");
}

/// A single JSON object emits exactly one NDJSON line.
#[test]
fn test_flatten_single_object_produces_one_line() {
    let input = br#"{"id":42,"value":"hello world"}"#;
    let result = flatten_json_document_to_ndjson(input, None).expect("should succeed");
    let text = std::str::from_utf8(&result).unwrap();
    let lines: Vec<&str> = text.lines().collect();
    assert_eq!(lines.len(), 1);
    let parsed: serde_json::Value = serde_json::from_str(lines[0]).unwrap();
    assert_eq!(parsed["id"], 42);
    assert_eq!(parsed["value"], "hello world");
}

/// An empty JSON array produces empty output (zero bytes).
#[test]
fn test_flatten_empty_array_produces_no_output() {
    let input = b"[]";
    let result = flatten_json_document_to_ndjson(input, None).expect("should succeed");
    assert!(result.is_empty(), "empty array should yield zero bytes");
}

/// A multi-line (pretty-printed) JSON document is flattened correctly.
#[test]
fn test_flatten_pretty_printed_document() {
    // Pretty-printed input: each element spans its own line in the source.
    let input = b"[\n  {\"a\": 1},\n  {\"a\": 2},\n  {\"a\": 3}\n]";
    let result = flatten_json_document_to_ndjson(input, None).expect("should succeed");
    let text = std::str::from_utf8(&result).unwrap();
    assert_eq!(text.lines().count(), 3);
}

/// Nested objects inside array elements are preserved as compact single-line JSON.
#[test]
fn test_flatten_array_with_nested_objects() {
    let input = br#"[{"outer":{"inner":99}},{"outer":{"inner":100}}]"#;
    let result = flatten_json_document_to_ndjson(input, None).expect("should succeed");
    let text = std::str::from_utf8(&result).unwrap();
    let lines: Vec<&str> = text.lines().collect();
    assert_eq!(lines.len(), 2);
    // Each line must not contain a newline mid-value.
    for line in &lines {
        assert!(!line.is_empty());
        let v: serde_json::Value = serde_json::from_str(line).unwrap();
        assert!(v["outer"]["inner"].as_i64().unwrap() >= 99);
    }
}

/// Each output line ends with exactly one newline (no blank lines between records).
#[test]
fn test_flatten_output_ends_with_newline_per_record() {
    let input = br#"[{"x":1},{"x":2}]"#;
    let result = flatten_json_document_to_ndjson(input, None).expect("should succeed");
    let text = std::str::from_utf8(&result).unwrap();
    // Exactly 2 newlines for 2 records.
    assert_eq!(text.chars().filter(|&c| c == '\n').count(), 2);
    // No leading blank line.
    assert!(!text.starts_with('\n'));
}

/// Invalid JSON returns an `InvalidData` IO error.
#[test]
fn test_flatten_invalid_json_returns_error() {
    let input = b"{ not valid json }";
    let err = flatten_json_document_to_ndjson(input, None).expect_err("should fail on invalid JSON");
    assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
}

/// Completely empty input returns an error (not valid JSON).
#[test]
fn test_flatten_empty_input_returns_error() {
    let err = flatten_json_document_to_ndjson(b"", None).expect_err("empty bytes are not valid JSON");
    assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
}
|
||||
|
||||
// ── sub-path navigation tests ─────────────────────────────────────────

/// `FROM s3object.employees` with a root JSON object navigates into the
/// `employees` array and emits one NDJSON line per element.
#[test]
fn test_flatten_sub_path_object_with_array() {
    let input = br#"{"employees":[{"id":1,"name":"Alice","salary":75000},{"id":2,"name":"Bob","salary":65000}]}"#;
    let result = flatten_json_document_to_ndjson(input, Some("employees")).expect("should succeed");
    let text = std::str::from_utf8(&result).unwrap();
    let lines: Vec<&str> = text.lines().collect();
    assert_eq!(lines.len(), 2, "each employee should be its own NDJSON line");
    let first: serde_json::Value = serde_json::from_str(lines[0]).unwrap();
    assert_eq!(first["name"], "Alice");
    assert_eq!(first["salary"], 75000);
    let second: serde_json::Value = serde_json::from_str(lines[1]).unwrap();
    assert_eq!(second["name"], "Bob");
}

/// Sub-path that does not exist in the root object falls back to emitting the
/// entire root object as one NDJSON line (graceful degradation).
#[test]
fn test_flatten_sub_path_missing_key_falls_back() {
    let input = br#"{"employees":[]}"#;
    let result = flatten_json_document_to_ndjson(input, Some("nonexistent")).expect("should succeed");
    let text = std::str::from_utf8(&result).unwrap();
    // Falls back to emitting the whole root object.
    assert_eq!(text.lines().count(), 1);
    let parsed: serde_json::Value = serde_json::from_str(text.trim_end()).unwrap();
    assert!(parsed.get("employees").is_some(), "root object preserved");
}

/// Sub-path is ignored when the root is already an array.
#[test]
fn test_flatten_sub_path_ignored_for_root_array() {
    let input = br#"[{"id":1},{"id":2}]"#;
    let result = flatten_json_document_to_ndjson(input, Some("employees")).expect("should succeed");
    let text = std::str::from_utf8(&result).unwrap();
    // The root array is flattened directly regardless of the sub-path hint.
    assert_eq!(text.lines().count(), 2);
}
|
||||
|
||||
// ── SQL path extraction tests ─────────────────────────────────────────

/// Plain lowercase `s3object.<key>` with an alias and a WHERE clause.
#[test]
fn test_extract_json_sub_path_basic() {
    let sql = "SELECT e.name FROM s3object.employees e WHERE e.salary > 70000";
    assert_eq!(extract_json_sub_path_from_expression(sql), Some("employees".to_string()));
}

/// The `s3object` prefix match is case-insensitive.
#[test]
fn test_extract_json_sub_path_uppercase() {
    let sql = "SELECT s.name FROM S3Object.records s";
    assert_eq!(extract_json_sub_path_from_expression(sql), Some("records".to_string()));
}

/// Plain `FROM s3object` (no dot) yields no sub-path, even when the WHERE
/// clause mentions `s3object.<field>`.
#[test]
fn test_extract_json_sub_path_no_sub_path() {
    let sql = "SELECT * FROM s3object WHERE s3object.age > 30";
    assert_eq!(extract_json_sub_path_from_expression(sql), None);
}

#[test]
fn test_extract_json_sub_path_with_bracket() {
    // `FROM s3object.employees[*]` — bracket stops path collection.
    let sql = "SELECT e.name FROM s3object.employees[*] e";
    assert_eq!(extract_json_sub_path_from_expression(sql), Some("employees".to_string()));
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@ use crate::query::Context;
|
||||
use crate::{QueryError, QueryResult, object_store::EcObjectStore};
|
||||
use datafusion::{
|
||||
execution::{SessionStateBuilder, context::SessionState, runtime_env::RuntimeEnvBuilder},
|
||||
parquet::data_type::AsBytes,
|
||||
prelude::SessionContext,
|
||||
};
|
||||
use object_store::{ObjectStore, memory::InMemory, path::Path};
|
||||
@@ -65,30 +64,36 @@ impl SessionCtxFactory {
|
||||
|
||||
let df_session_state = if self.is_test {
|
||||
let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
|
||||
let data = b"id,name,age,department,salary
|
||||
1,Alice,25,HR,5000
|
||||
2,Bob,30,IT,6000
|
||||
3,Charlie,35,Finance,7000
|
||||
4,Diana,22,Marketing,4500
|
||||
5,Eve,28,IT,5500
|
||||
6,Frank,40,Finance,8000
|
||||
7,Grace,26,HR,5200
|
||||
8,Henry,32,IT,6200
|
||||
9,Ivy,24,Marketing,4800
|
||||
10,Jack,38,Finance,7500";
|
||||
let data_bytes = data.as_bytes();
|
||||
// let data = r#""year"╦"gender"╦"ethnicity"╦"firstname"╦"count"╦"rank"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"SOPHIA"╦"119"╦"1"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"CHLOE"╦"106"╦"2"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"EMILY"╦"93"╦"3"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"OLIVIA"╦"89"╦"4"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"EMMA"╦"75"╦"5"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"ISABELLA"╦"67"╦"6"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"TIFFANY"╦"54"╦"7"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"ASHLEY"╦"52"╦"8"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"FIONA"╦"48"╦"9"
|
||||
// "2011"╦"FEMALE"╦"ASIAN AND PACIFIC ISLANDER"╦"ANGELA"╦"47"╦"10""#;
|
||||
// let data_bytes = Bytes::from(data);
|
||||
|
||||
// Choose test data format based on what the request serialization specifies.
|
||||
let data_bytes: &[u8] = if context.input.request.input_serialization.json.is_some() {
|
||||
// NDJSON: one JSON object per line — usable for both LINES and DOCUMENT
|
||||
// requests (DOCUMENT inputs are converted to NDJSON by EcObjectStore, but
|
||||
// in test mode we bypass EcObjectStore, so we put NDJSON here directly).
|
||||
b"{\"id\":1,\"name\":\"Alice\",\"age\":25,\"department\":\"HR\",\"salary\":5000}\n\
|
||||
{\"id\":2,\"name\":\"Bob\",\"age\":30,\"department\":\"IT\",\"salary\":6000}\n\
|
||||
{\"id\":3,\"name\":\"Charlie\",\"age\":35,\"department\":\"Finance\",\"salary\":7000}\n\
|
||||
{\"id\":4,\"name\":\"Diana\",\"age\":22,\"department\":\"Marketing\",\"salary\":4500}\n\
|
||||
{\"id\":5,\"name\":\"Eve\",\"age\":28,\"department\":\"IT\",\"salary\":5500}\n\
|
||||
{\"id\":6,\"name\":\"Frank\",\"age\":40,\"department\":\"Finance\",\"salary\":8000}\n\
|
||||
{\"id\":7,\"name\":\"Grace\",\"age\":26,\"department\":\"HR\",\"salary\":5200}\n\
|
||||
{\"id\":8,\"name\":\"Henry\",\"age\":32,\"department\":\"IT\",\"salary\":6200}\n\
|
||||
{\"id\":9,\"name\":\"Ivy\",\"age\":24,\"department\":\"Marketing\",\"salary\":4800}\n\
|
||||
{\"id\":10,\"name\":\"Jack\",\"age\":38,\"department\":\"Finance\",\"salary\":7500}\n"
|
||||
} else {
|
||||
b"id,name,age,department,salary
|
||||
1,Alice,25,HR,5000
|
||||
2,Bob,30,IT,6000
|
||||
3,Charlie,35,Finance,7000
|
||||
4,Diana,22,Marketing,4500
|
||||
5,Eve,28,IT,5500
|
||||
6,Frank,40,Finance,8000
|
||||
7,Grace,26,HR,5200
|
||||
8,Henry,32,IT,6200
|
||||
9,Ivy,24,Marketing,4800
|
||||
10,Jack,38,Finance,7500"
|
||||
};
|
||||
|
||||
let path = Path::from(context.input.key.clone());
|
||||
store.put(&path, data_bytes.into()).await.map_err(|e| {
|
||||
error!("put data into memory failed: {}", e.to_string());
|
||||
@@ -97,7 +102,7 @@ impl SessionCtxFactory {
|
||||
|
||||
df_session_state.with_object_store(&store_url, Arc::new(store)).build()
|
||||
} else {
|
||||
let store =
|
||||
let store: EcObjectStore =
|
||||
EcObjectStore::new(context.input.clone()).map_err(|_| QueryError::NotImplemented { err: String::new() })?;
|
||||
df_session_state.with_object_store(&store_url, Arc::new(store)).build()
|
||||
};
|
||||
|
||||
@@ -218,7 +218,16 @@ impl SimpleQueryDispatcher {
|
||||
(ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet"), false, false)
|
||||
} else if self.input.request.input_serialization.json.is_some() {
|
||||
let file_format = JsonFormat::default();
|
||||
(ListingOptions::new(Arc::new(file_format)).with_file_extension(".json"), false, false)
|
||||
// Use the actual file extension from the object key so that files stored
|
||||
// with a `.jsonl` suffix (newline-delimited JSON) are also matched by
|
||||
// DataFusion's listing/schema-inference logic. Falling back to ".json"
|
||||
// preserves behaviour for keys that have no extension.
|
||||
let file_ext = std::path::Path::new(&self.input.key)
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.map(|e| format!(".{e}"))
|
||||
.unwrap_or_else(|| ".json".to_string());
|
||||
(ListingOptions::new(Arc::new(file_format)).with_file_extension(file_ext), false, false)
|
||||
} else {
|
||||
return Err(QueryError::NotImplemented {
|
||||
err: "not support this file type".to_string(),
|
||||
|
||||
@@ -20,8 +20,8 @@ mod integration_tests {
|
||||
query::{Context, Query},
|
||||
};
|
||||
use s3s::dto::{
|
||||
CSVInput, CSVOutput, ExpressionType, FileHeaderInfo, InputSerialization, OutputSerialization, SelectObjectContentInput,
|
||||
SelectObjectContentRequest,
|
||||
CSVInput, CSVOutput, ExpressionType, FileHeaderInfo, InputSerialization, JSONInput, JSONOutput, JSONType,
|
||||
OutputSerialization, SelectObjectContentInput, SelectObjectContentRequest,
|
||||
};
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -53,6 +53,36 @@ mod integration_tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a `SelectObjectContentInput` targeting a JSON DOCUMENT file.
/// Uses `JSONType::DOCUMENT` so the NDJSON-flattening path in
/// `EcObjectStore` is exercised.
fn create_test_json_input(sql: &str) -> SelectObjectContentInput {
    SelectObjectContentInput {
        bucket: "test-bucket".to_string(),
        expected_bucket_owner: None,
        // A ".json" key so extension-based file matching picks the object up.
        key: "test.json".to_string(),
        // No SSE-C settings are needed for these tests.
        sse_customer_algorithm: None,
        sse_customer_key: None,
        sse_customer_key_md5: None,
        request: SelectObjectContentRequest {
            expression: sql.to_string(),
            expression_type: ExpressionType::from_static("SQL"),
            input_serialization: InputSerialization {
                // DOCUMENT (not LINES): the whole object is one JSON value.
                json: Some(JSONInput {
                    type_: Some(JSONType::from_static(JSONType::DOCUMENT)),
                }),
                ..Default::default()
            },
            output_serialization: OutputSerialization {
                json: Some(JSONOutput::default()),
                ..Default::default()
            },
            request_progress: None,
            scan_range: None,
        },
    }
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_database_creation() {
|
||||
let input = create_test_input("SELECT * FROM S3Object");
|
||||
@@ -225,4 +255,170 @@ mod integration_tests {
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────
// JSON-input variants of all the above tests
// These exercise the JSONType::LINES (JSON lines) code path
// NOTE(review): `create_test_json_input` actually sets `JSONType::DOCUMENT`;
// confirm which JSON code path these variants are meant to cover.
// ──────────────────────────────────────────────

/// The database handle can be constructed for a JSON-input request.
#[tokio::test]
async fn test_database_creation_json() {
    let input = create_test_json_input("SELECT * FROM S3Object");
    let result = make_rustfsms(Arc::new(input), true).await;
    assert!(result.is_ok());
}

/// The global (shared) database is obtainable for a JSON-input request.
#[tokio::test]
async fn test_global_db_creation_json() {
    let input = create_test_json_input("SELECT * FROM S3Object");
    let result = get_global_db(input.clone(), true).await;
    assert!(result.is_ok());
}

/// A plain `SELECT *` over JSON input executes and yields a chunked result.
#[tokio::test]
async fn test_simple_select_query_json() {
    let sql = "SELECT * FROM S3Object";
    let input = create_test_json_input(sql);
    let db = get_global_db(input.clone(), true).await.unwrap();
    let query = Query::new(Context { input: Arc::new(input) }, sql.to_string());

    let result = db.execute(&query).await;
    assert!(result.is_ok());

    // Drain the result to ensure the full execution pipeline completes.
    let query_handle = result.unwrap();
    let output = query_handle.result().chunk_result().await;
    assert!(output.is_ok());
}

/// Projection plus a WHERE filter over JSON input executes successfully.
#[tokio::test]
async fn test_select_with_where_clause_json() {
    let sql = "SELECT name, age FROM S3Object WHERE age > 30";
    let input = create_test_json_input(sql);
    let db = get_global_db(input.clone(), true).await.unwrap();
    let query = Query::new(Context { input: Arc::new(input) }, sql.to_string());

    let result = db.execute(&query).await;
    assert!(result.is_ok());
}

/// GROUP BY aggregation over JSON input; outcome is tolerated either way.
#[tokio::test]
async fn test_select_with_aggregation_json() {
    let sql = "SELECT department, COUNT(*) as count FROM S3Object GROUP BY department";
    let input = create_test_json_input(sql);
    let db = get_global_db(input.clone(), true).await.unwrap();
    let query = Query::new(Context { input: Arc::new(input) }, sql.to_string());

    let result = db.execute(&query).await;
    // Aggregation queries may fail due to lack of actual data, which is acceptable
    match result {
        Ok(_) => {
            // If successful, that's great
        }
        Err(_) => {
            // Expected to fail due to no actual data source
        }
    }
}

/// Malformed SQL must be rejected at execution time.
#[tokio::test]
async fn test_invalid_sql_syntax_json() {
    let sql = "INVALID SQL SYNTAX";
    let input = create_test_json_input(sql);
    let db = get_global_db(input.clone(), true).await.unwrap();
    let query = Query::new(Context { input: Arc::new(input) }, sql.to_string());

    let result = db.execute(&query).await;
    assert!(result.is_err());
}

/// Multiple SQL statements in one request produce a `MultiStatement` error
/// that reports the statement count.
#[tokio::test]
async fn test_multi_statement_error_json() {
    let sql = "SELECT * FROM S3Object; SELECT 1;";
    let input = create_test_json_input(sql);
    let db = get_global_db(input.clone(), true).await.unwrap();
    let query = Query::new(Context { input: Arc::new(input) }, sql.to_string());

    let result = db.execute(&query).await;
    assert!(result.is_err());

    if let Err(QueryError::MultiStatement { num, .. }) = result {
        assert_eq!(num, 2);
    } else {
        panic!("Expected MultiStatement error");
    }
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_state_machine_workflow_json() {
|
||||
let sql = "SELECT * FROM S3Object";
|
||||
let input = create_test_json_input(sql);
|
||||
let db = get_global_db(input.clone(), true).await.unwrap();
|
||||
let query = Query::new(Context { input: Arc::new(input) }, sql.to_string());
|
||||
|
||||
let state_machine = db.build_query_state_machine(query.clone()).await;
|
||||
assert!(state_machine.is_ok());
|
||||
|
||||
let state_machine = state_machine.unwrap();
|
||||
|
||||
let logical_plan = db.build_logical_plan(state_machine.clone()).await;
|
||||
assert!(logical_plan.is_ok());
|
||||
|
||||
if let Ok(Some(plan)) = logical_plan {
|
||||
let execution_result = db.execute_logical_plan(plan, state_machine).await;
|
||||
assert!(execution_result.is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_with_limit_json() {
|
||||
let sql = "SELECT * FROM S3Object LIMIT 5";
|
||||
let input = create_test_json_input(sql);
|
||||
let db = get_global_db(input.clone(), true).await.unwrap();
|
||||
let query = Query::new(Context { input: Arc::new(input) }, sql.to_string());
|
||||
|
||||
let result = db.execute(&query).await;
|
||||
assert!(result.is_ok());
|
||||
|
||||
let query_handle = result.unwrap();
|
||||
let output = query_handle.result().chunk_result().await.unwrap();
|
||||
|
||||
let total_rows: usize = output.iter().map(|batch| batch.num_rows()).sum();
|
||||
assert!(total_rows <= 5);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_with_order_by_json() {
|
||||
let sql = "SELECT name, age FROM S3Object ORDER BY age DESC";
|
||||
let input = create_test_json_input(sql);
|
||||
let db = get_global_db(input.clone(), true).await.unwrap();
|
||||
let query = Query::new(Context { input: Arc::new(input) }, sql.to_string());
|
||||
|
||||
let result = db.execute(&query).await;
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_concurrent_queries_json() {
|
||||
let sql = "SELECT * FROM S3Object";
|
||||
let input = create_test_json_input(sql);
|
||||
let db = get_global_db(input.clone(), true).await.unwrap();
|
||||
|
||||
let mut handles = vec![];
|
||||
for i in 0..3 {
|
||||
let query = Query::new(
|
||||
Context {
|
||||
input: Arc::new(input.clone()),
|
||||
},
|
||||
format!("SELECT * FROM S3Object LIMIT {}", i + 1),
|
||||
);
|
||||
let db_clone = db.clone();
|
||||
let handle = tokio::spawn(async move { db_clone.execute(&query).await });
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
for handle in handles {
|
||||
let result = handle.await.unwrap();
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user