Commit d2066c5

feat: bootstrap MyDuck Server with DuckDB (apecloud#154)

Authored by TianyuZhang1214 and fanyang01

Co-authored-by: Fan Yang <[email protected]>

1 parent 3e9e0e0 · commit d2066c5

File tree

9 files changed: +117 −6 lines


README.md

Lines changed: 8 additions & 0 deletions

```diff
@@ -123,6 +123,14 @@ MyDuck Server supports setting up replicas from common cloud-based MySQL offerin
 
 With MyDuck's powerful analytics capabilities, you can create an HTAP (Hybrid Transactional/Analytical Processing) system where high-frequency data writes are directed to a standard MySQL instance, while analytical queries are handled by a MyDuck Server instance. Follow our HTAP setup instructions based on [ProxySQL](docs/tutorial/htap-proxysql-setup.md) or [MariaDB MaxScale](docs/tutorial/htap-maxscale-setup.md) to easily set up an HTAP demonstration.
 
+### Query & Load Parquet Files
+
+Looking to load Parquet files into MyDuck Server and start querying? Follow our [Parquet file loading guide](docs/tutorial/load-parquet-files.md) for easy setup.
+
+### Already Using DuckDB?
+
+Already have a DuckDB file? You can seamlessly bootstrap MyDuck Server with it. See our [DuckDB file bootstrapping guide](docs/tutorial/bootstrap.md) for more details.
+
 ## 💡 Contributing
 
 Let’s make MySQL analytics fast and powerful—together!
```

catalog/database.go

Lines changed: 8 additions & 2 deletions

```diff
@@ -217,9 +217,9 @@ func (d *Database) RenameTable(ctx *sql.Context, oldName string, newName string)
 // extractViewDefinitions is a helper function to extract view definitions from DuckDB
 func (d *Database) extractViewDefinitions(ctx *sql.Context, schemaName string, viewName string) ([]sql.ViewDefinition, error) {
 	query := `
-		SELECT view_name, sql
+		SELECT DISTINCT view_name, sql
 		FROM duckdb_views()
-		WHERE schema_name = ?
+		WHERE schema_name = ? AND NOT internal
 	`
 	args := []interface{}{schemaName}
@@ -240,6 +240,12 @@ func (d *Database) extractViewDefinitions(ctx *sql.Context, schemaName string, v
 		if err := rows.Scan(&name, &createViewStmt); err != nil {
 			return nil, ErrDuckDB.New(err)
 		}
+
+		// Skip system views directly
+		if IsSystemView(name) {
+			continue
+		}
+
 		views = append(views, sql.ViewDefinition{
 			Name:                name,
 			CreateViewStatement: createViewStmt,
```

catalog/internal_tables.go

Lines changed: 3 additions & 3 deletions

```diff
@@ -81,21 +81,21 @@ var InternalTables = struct {
 	GlobalStatus       InternalTable
 }{
 	PersistentVariable: InternalTable{
-		Schema:       "main",
+		Schema:       "__sys__",
 		Name:         "persistent_variable",
 		KeyColumns:   []string{"name"},
 		ValueColumns: []string{"value", "vtype"},
 		DDL:          "name TEXT PRIMARY KEY, value TEXT, vtype TEXT",
 	},
 	BinlogPosition: InternalTable{
-		Schema:       "main",
+		Schema:       "__sys__",
 		Name:         "binlog_position",
 		KeyColumns:   []string{"channel"},
 		ValueColumns: []string{"position"},
 		DDL:          "channel TEXT PRIMARY KEY, position TEXT",
 	},
 	PgReplicationLSN: InternalTable{
-		Schema:       "main",
+		Schema:       "__sys__",
 		Name:         "pg_replication_lsn",
 		KeyColumns:   []string{"slot_name"},
 		ValueColumns: []string{"lsn"},
```
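This change moves MyDuck's internal bookkeeping tables out of DuckDB's default `main` schema into a dedicated `__sys__` schema, so they no longer collide with user objects. A minimal standalone sketch of how such a descriptor could expand into schema-qualified DDL (the struct fields come from the diff; the `createStmt` helper is hypothetical, not the project's actual code):

```go
package main

import (
	"fmt"
	"strings"
)

// InternalTable mirrors the fields visible in the diff.
type InternalTable struct {
	Schema       string
	Name         string
	KeyColumns   []string
	ValueColumns []string
	DDL          string
}

// createStmt is a hypothetical helper: it qualifies the table with its
// schema so internal state lands in "__sys__" rather than "main".
func (t InternalTable) createStmt() string {
	var b strings.Builder
	fmt.Fprintf(&b, "CREATE TABLE IF NOT EXISTS %s.%s (%s)", t.Schema, t.Name, t.DDL)
	return b.String()
}

func main() {
	pv := InternalTable{
		Schema:       "__sys__",
		Name:         "persistent_variable",
		KeyColumns:   []string{"name"},
		ValueColumns: []string{"value", "vtype"},
		DDL:          "name TEXT PRIMARY KEY, value TEXT, vtype TEXT",
	}
	// Prints the schema-qualified CREATE TABLE statement.
	fmt.Println(pv.createStmt())
}
```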

catalog/provider.go

Lines changed: 1 addition & 1 deletion

```diff
@@ -155,7 +155,7 @@ func (prov *DatabaseProvider) AllDatabases(ctx *sql.Context) []sql.Database {
 		}
 
 		switch schemaName {
-		case "information_schema", "main", "pg_catalog":
+		case "information_schema", "pg_catalog", "__sys__":
 			continue
 		}
```

catalog/system_views.go

Lines changed: 22 additions & 0 deletions (new file)

```diff
@@ -0,0 +1,22 @@
+package catalog
+
+var SystemViews = map[string]struct{}{
+	"duckdb_columns":       {},
+	"duckdb_constraints":   {},
+	"duckdb_databases":     {},
+	"duckdb_indexes":       {},
+	"duckdb_schemas":       {},
+	"duckdb_tables":        {},
+	"duckdb_types":         {},
+	"duckdb_views":         {},
+	"pragma_database_list": {},
+	"sqlite_master":        {},
+	"sqlite_schema":        {},
+	"sqlite_temp_master":   {},
+	"sqlite_temp_schema":   {},
+}
+
+func IsSystemView(viewName string) bool {
+	_, ok := SystemViews[viewName]
+	return ok
+}
```
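The new helper is a plain set-membership check over DuckDB's and SQLite's built-in view names. A self-contained sketch of how it behaves (the map and function are copied from the new file; the filtering loop in `main` is illustrative, not the actual `extractViewDefinitions` code):

```go
package main

import "fmt"

// SystemViews mirrors catalog/system_views.go: DuckDB/SQLite built-in
// views that should never surface as user-defined views.
var SystemViews = map[string]struct{}{
	"duckdb_columns": {}, "duckdb_constraints": {}, "duckdb_databases": {},
	"duckdb_indexes": {}, "duckdb_schemas": {}, "duckdb_tables": {},
	"duckdb_types": {}, "duckdb_views": {}, "pragma_database_list": {},
	"sqlite_master": {}, "sqlite_schema": {},
	"sqlite_temp_master": {}, "sqlite_temp_schema": {},
}

// IsSystemView reports whether viewName is one of the built-in system views.
func IsSystemView(viewName string) bool {
	_, ok := SystemViews[viewName]
	return ok
}

func main() {
	// Filter a mixed list the way the patched scan loop does:
	// system views are skipped, user views are kept.
	names := []string{"duckdb_tables", "my_report_view", "sqlite_master"}
	var userViews []string
	for _, name := range names {
		if IsSystemView(name) {
			continue
		}
		userViews = append(userViews, name)
	}
	fmt.Println(userViews) // [my_report_view]
}
```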

docs/data/example.db

524 KB
Binary file not shown.

docs/data/example.parquet

352 Bytes
Binary file not shown.

docs/tutorial/bootstrap.md

Lines changed: 34 additions & 0 deletions (new file)

# Bootstrapping from an Existing DuckDB File

Given an existing DuckDB file, you can bootstrap MyDuck Server with it and serve its data directly. Here’s how to work with the `example.db` file located in `docs/data/`.

### Steps

1. **Prepare the data directory:**
   ```bash
   mkdir example_data
   cp /path/to/example.db example_data/mysql.db  # IMPORTANT: The attached file must be named `mysql.db`
   ```

2. **Launch MyDuck Server and attach the data directory:**
   ```bash
   docker run \
     -p 13306:3306 \
     -p 15432:5432 \
     --volume=/path/to/example_data:/home/admin/data \
     apecloud/myduckserver:main
   ```

3. **Connect to MyDuck Server and query:**
   ```bash
   # Using psql client & DuckDB syntax
   psql -h 127.0.0.1 -p 15432 -U mysql <<EOF
   SELECT * FROM "test_data";
   EOF

   # Or using MySQL client & syntax (quote the heredoc delimiter so the
   # backticks are not treated as shell command substitution)
   mysql -h 127.0.0.1 -uroot -P13306 main <<'EOF'
   SELECT * FROM `test_data`;
   EOF
   ```
docs/tutorial/load-parquet-files.md

Lines changed: 41 additions & 0 deletions (new file)

# Query & Load Parquet Files

Imagine you have a large dataset stored in Parquet files and want to share it with your team so they can query it using SQL. The files are too large to store locally and too slow to download from cloud storage every time. Instead, you can put them on a server that is accessible to your team and run a MyDuck Server instance on it. Your team can then query the dataset easily with either a Postgres or a MySQL client.

Below, we’ll show you how to query and load the `example.parquet` file from the `docs/data/` directory by mounting it into a MyDuck Server container.

## Steps

1. **Run MyDuck Server:**
   ```bash
   docker run -p 13306:3306 -p 15432:5432 \
     -v /path/to/example.parquet:/home/admin/data/example.parquet \
     apecloud/myduckserver:main
   ```

2. **Connect to MyDuck Server using `psql`:**
   ```bash
   psql -h 127.0.0.1 -p 15432 -U mysql
   ```

3. **Query the Parquet file directly:**
   ```sql
   SELECT * FROM '/home/admin/data/example.parquet' LIMIT 10;
   ```

4. **Load the Parquet file into a DuckDB table:**
   ```sql
   CREATE TABLE test_data AS SELECT * FROM '/home/admin/data/example.parquet';
   SELECT * FROM test_data LIMIT 10;
   ```

5. **Query the data with a MySQL client & syntax:**
   ```bash
   mysql -h 127.0.0.1 -uroot -P13306 main
   ```
   ```sql
   SELECT * FROM `test_data`;
   ```
