diff --git a/.gitignore b/.gitignore index 95a19037..c1a8123b 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,8 @@ package-lock.json *.tgz .vscode *.parquet +*.csv +*.jsonl /coverage/ /lib/ diff --git a/package.json b/package.json index 937dc70c..4f9cb3fb 100644 --- a/package.json +++ b/package.json @@ -62,9 +62,9 @@ "squirreling": "0.7.9" }, "devDependencies": { - "@storybook/react-vite": "10.2.2", + "@storybook/react-vite": "10.2.3", "@testing-library/react": "16.3.2", - "@types/node": "25.1.0", + "@types/node": "25.2.0", "@types/react": "19.2.10", "@types/react-dom": "19.2.3", "@vitejs/plugin-react": "5.1.2", @@ -72,15 +72,15 @@ "eslint": "9.39.2", "eslint-plugin-react": "7.37.5", "eslint-plugin-react-hooks": "7.0.1", - "eslint-plugin-react-refresh": "0.4.26", - "eslint-plugin-storybook": "10.2.2", - "globals": "17.2.0", + "eslint-plugin-react-refresh": "0.5.0", + "eslint-plugin-storybook": "10.2.3", + "globals": "17.3.0", "jsdom": "27.4.0", "nodemon": "3.1.11", "npm-run-all": "4.1.5", "react": "19.2.4", "react-dom": "19.2.4", - "storybook": "10.2.2", + "storybook": "10.2.3", "typescript": "5.9.3", "typescript-eslint": "8.54.0", "vite": "7.3.1", diff --git a/src/components/ParquetView/ParquetView.module.css b/src/components/TableView/TableView.module.css similarity index 100% rename from src/components/ParquetView/ParquetView.module.css rename to src/components/TableView/TableView.module.css diff --git a/src/components/ParquetView/ParquetView.tsx b/src/components/TableView/TableView.tsx similarity index 80% rename from src/components/ParquetView/ParquetView.tsx rename to src/components/TableView/TableView.tsx index 82dbd6bb..86b94e50 100644 --- a/src/components/ParquetView/ParquetView.tsx +++ b/src/components/TableView/TableView.tsx @@ -1,16 +1,15 @@ -import HighTable, { DataFrame, sortableDataFrame } from 'hightable' +import HighTable, { DataFrame } from 'hightable' import 'hightable/src/HighTable.css' -import { asyncBufferFromUrl, parquetMetadataAsync } 
from 'hyparquet' import React, { useCallback, useEffect, useState } from 'react' import { useConfig } from '../../hooks/useConfig.js' import { appendSearchParams } from '../../lib/routes.js' import { FileSource } from '../../lib/sources/types.js' -import { parquetDataFrame } from '../../lib/tableProvider.js' +import { tableProvider } from '../../lib/tableProvider.js' import { cn } from '../../lib/utils.js' import CellPanel from '../CellPanel/CellPanel.js' import ContentWrapper, { ContentSize } from '../ContentWrapper/ContentWrapper.js' import SlidePanel from '../SlidePanel/SlidePanel.js' -import styles from './ParquetView.module.css' +import styles from './TableView.module.css' interface ViewerProps { source: FileSource @@ -23,27 +22,22 @@ interface Content extends ContentSize { } /** - * Parquet file viewer + * Table file viewer for parquet, CSV, and JSONL files */ -export default function ParquetView({ source, setProgress, setError }: ViewerProps) { +export default function TableView({ source, setProgress, setError }: ViewerProps) { const [isLoading, setIsLoading] = useState(true) const [content, setContent] = useState() const [cell, setCell] = useState<{ row: number, col: number } | undefined>() const { customClass, routes } = useConfig() useEffect(() => { - async function loadParquetDataFrame() { + async function loadDataFrame() { try { setIsLoading(true) - setProgress(0.33) - const { resolveUrl, requestInit } = source - const asyncBuffer = await asyncBufferFromUrl({ url: resolveUrl, requestInit }) - const from = { url: resolveUrl, byteLength: asyncBuffer.byteLength, requestInit } - setProgress(0.66) - const metadata = await parquetMetadataAsync(asyncBuffer) - const dataframe = sortableDataFrame(parquetDataFrame(from, metadata)) - const fileSize = asyncBuffer.byteLength - setContent({ dataframe, fileSize }) + setProgress(0.5) + const { resolveUrl, fileName, requestInit } = source + const dataframe = await tableProvider({ url: resolveUrl, fileName, requestInit 
}) + setContent({ dataframe }) } catch (error) { setError(error) } finally { @@ -51,7 +45,7 @@ export default function ParquetView({ source, setProgress, setError }: ViewerPro setProgress(1) } } - void loadParquetDataFrame() + void loadDataFrame() }, [setError, setProgress, source]) // Close cell view on escape key diff --git a/src/components/Viewer/Viewer.tsx b/src/components/Viewer/Viewer.tsx index 47388fe2..0f92569a 100644 --- a/src/components/Viewer/Viewer.tsx +++ b/src/components/Viewer/Viewer.tsx @@ -4,7 +4,7 @@ import AvroView from '../AvroView/AvroView.js' import ImageView from '../ImageView/ImageView.js' import JsonView from '../JsonView/JsonView.js' import MarkdownView from '../MarkdownView/MarkdownView.js' -import TableView from '../ParquetView/ParquetView.js' +import TableView from '../TableView/TableView.js' import TextView from '../TextView/TextView.js' interface ViewerProps { @@ -21,7 +21,7 @@ export default function Viewer({ source, setError, setProgress }: ViewerProps) { const { fileName } = source if (fileName.endsWith('.md')) { return - } else if (fileName.endsWith('.parquet')) { + } else if (fileName.endsWith('.parquet') || fileName.endsWith('.csv') || fileName.endsWith('.jsonl')) { return } else if (fileName.endsWith('.json')) { return diff --git a/src/components/index.ts b/src/components/index.ts index 3aec67f0..94a071eb 100644 --- a/src/components/index.ts +++ b/src/components/index.ts @@ -14,7 +14,7 @@ import Layout from './Layout/Layout.js' import Markdown from './Markdown/Markdown.js' import MarkdownView from './MarkdownView/MarkdownView.js' import Page from './Page/Page.js' -import ParquetView from './ParquetView/ParquetView.js' +import TableView from './TableView/TableView.js' import ProgressBar from './ProgressBar/ProgressBar.js' import SlidePanel from './SlidePanel/SlidePanel.js' import Spinner from './Spinner/Spinner.js' @@ -40,7 +40,7 @@ export { Markdown, MarkdownView, Page, - ParquetView, + TableView, ProgressBar, SlidePanel, 
/**
 * Parse CSV text into nested array of rows and columns.
 *
 * Fields are separated by commas and rows by `\n` or `\r\n`. A field may be
 * wrapped in double quotes, in which case commas and line breaks inside it are
 * kept verbatim and a doubled quote (`""`) stands for one literal quote.
 * Every value is returned as a string: there is no header handling and no
 * type conversion. An empty trailing line does not produce a row.
 *
 * An unterminated quoted field is reported through `console.error`; whatever
 * was read up to the end of the input is still returned as that field.
 *
 * @param text CSV document
 * @returns one array of field strings per row
 */
export function parseCsv(text: string): string[][] {
  const rows: string[][] = []
  let row: string[] = []
  let field = ''
  let inQuotes = false
  let previousCharWasQuote = false
  // A '\r' seen outside quotes is held back until the next character shows
  // whether it belongs to a '\r\n' row terminator (dropped) or is data (kept).
  let pendingCarriageReturn = false

  for (const char of text) {
    if (inQuotes && char === '"' && !previousCharWasQuote) {
      // first quote, wait to see if it's escaped or end of field
      previousCharWasQuote = true
    } else if (inQuotes && char === '"' && previousCharWasQuote) {
      // csv escaped quote: "" inside a quoted field is one literal quote
      field += char
      previousCharWasQuote = false
    } else if (inQuotes && !previousCharWasQuote) {
      // append quoted character to field (including ',' '\r' and '\n')
      field += char
    } else {
      // not in quotes, or the previous quote just closed the field
      inQuotes = false
      previousCharWasQuote = false

      if (pendingCarriageReturn) {
        pendingCarriageReturn = false
        // a lone '\r' that is not followed by '\n' is ordinary data
        if (char !== '\n') field += '\r'
      }

      switch (char) {
        case ',':
          // emit column
          row.push(field)
          field = ''
          break
        case '\n':
          // emit row
          row.push(field)
          rows.push(row)
          row = []
          field = ''
          break
        case '\r':
          pendingCarriageReturn = true
          break
        case '"':
          inQuotes = true
          break
        default:
          field += char
      }
    }
  }

  // a '\r' at the very end of the input was never followed by '\n': keep it
  if (pendingCarriageReturn) {
    field += '\r'
  }

  if (inQuotes && !previousCharWasQuote) {
    console.error('csv unterminated quote')
  }

  // handle last field and row, but skip empty last line
  if (field || row.length) {
    row.push(field)
    rows.push(row)
  }

  return rows
}
interface TableProviderOptions {
  url: string
  fileName: string
  requestInit?: RequestInit
}

/**
 * Create a dataframe from a file URL, automatically detecting the file type.
 * Supports parquet, CSV, and JSONL files.
 *
 * The type is chosen from the extension of `fileName`, compared
 * case-insensitively: `.csv` and `.jsonl` are parsed in full, anything else is
 * treated as parquet.
 *
 * @param options.url location of the file contents
 * @param options.fileName name used only for extension detection
 * @param options.requestInit optional fetch options forwarded with every request
 * @returns a sortable dataframe over the file
 */
export async function tableProvider({ url, fileName, requestInit }: TableProviderOptions): Promise<DataFrame> {
  // the async buffer supplies the byte length that the `from` descriptor carries
  const asyncBuffer = await asyncBufferFromUrl({ url, requestInit })
  const from = { url, byteLength: asyncBuffer.byteLength, requestInit }

  const baseName = fileName.toLowerCase()
  if (baseName.endsWith('.csv')) {
    return csvDataFrame(from)
  }

  if (baseName.endsWith('.jsonl')) {
    return jsonLinesDataFrame(from)
  }

  // Default to parquet
  const metadata = await parquetMetadataAsync(asyncBuffer)
  return sortableDataFrame(parquetDataFrame(from, metadata))
}

/**
 * Read the whole source into memory and decode it as text.
 *
 * @throws Error when the source is a URL and the HTTP response is not ok, so
 *   that an error page is never parsed as table data
 */
async function readText(from: AsyncBufferFrom): Promise<string> {
  let buffer: ArrayBuffer
  if ('file' in from) {
    buffer = await from.file.arrayBuffer()
  } else {
    const response = await fetch(from.url, from.requestInit)
    if (!response.ok) {
      throw new Error(`Failed to fetch ${from.url}: ${response.status} ${response.statusText}`)
    }
    buffer = await response.arrayBuffer()
  }
  return new TextDecoder().decode(buffer)
}

/**
 * Convert a CSV file into a sortable dataframe.
 *
 * Parses the entire file and creates a sortable dataframe.
 * The first row is treated as the header; every following row becomes an
 * object keyed by the header names. Cells missing from a short row are
 * `undefined`, and all present values are strings.
 *
 * @throws Error when the file is fetched by URL and the response is not ok
 */
export async function csvDataFrame(from: AsyncBufferFrom): Promise<DataFrame> {
  const text = await readText(from)
  const lines = parseCsv(text)
  const header = lines[0] ?? []
  const rows = lines.slice(1).map(row => {
    return Object.fromEntries(header.map((key, i) => [key, row[i]]))
  })
  return sortableDataFrame(arrayDataFrame(rows))
}

/**
 * Convert a JSONL file into a sortable dataframe.
 *
 * Parses each non-blank line as a JSON object and creates a sortable dataframe.
 *
 * @throws Error when the file is fetched by URL and the response is not ok
 * @throws SyntaxError naming the 1-based line number when a line is not valid JSON
 */
export async function jsonLinesDataFrame(from: AsyncBufferFrom): Promise<DataFrame> {
  const text = await readText(from)
  const rows: Record<string, unknown>[] = []
  text.split('\n').forEach((line, index) => {
    // blank lines (including a trailing one) carry no record
    if (!line.trim()) return
    try {
      // NOTE(review): each line is assumed to hold a JSON object; the value is not validated
      rows.push(JSON.parse(line) as Record<string, unknown>)
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error)
      throw new SyntaxError(`Invalid JSON on line ${index + 1}: ${reason}`)
    }
  })
  return sortableDataFrame(arrayDataFrame(rows))
}
newlines within quoted fields', () => { + const csv = 'Name,Address\nAlice,"123 Main St.\nAnytown, USA"' + const expected = [ + ['Name', 'Address'], + ['Alice', '123 Main St.\nAnytown, USA'], + ] + expect(parseCsv(csv)).toEqual(expected) + }) + + it('handles unterminated quotes', () => { + const csv = 'Name,Quote\nAlice,"This is an unterminated quote\n' + const expected = [ + ['Name', 'Quote'], + ['Alice', 'This is an unterminated quote\n'], + ] + vi.spyOn(console, 'error') + expect(parseCsv(csv)).toEqual(expected) + expect(console.error).toHaveBeenCalledWith('csv unterminated quote') + }) +})