Merge branch 'main' into fastlanes
broccoliSpicy committed Sep 18, 2024
2 parents 7c21438 + 9c361fe commit 3b82ec5
Showing 138 changed files with 2,123 additions and 476 deletions.
21 changes: 20 additions & 1 deletion .github/workflows/cargo-publish.yml
@@ -5,6 +5,12 @@ on:
# Use released instead of published, since we don't publish preview/beta
# versions. Users instead install them from the git repo.
types: [released]
workflow_dispatch:
inputs:
tag:
description: 'Tag to publish (e.g., v1.0.0)'
required: true
type: string

env:
# This env var is used by Swatinem/rust-cache@v2 for the cache
@@ -14,7 +20,7 @@ env:
jobs:
build:
runs-on: ubuntu-latest
timeout-minutes: 30
timeout-minutes: 60
env:
# Need up-to-date compilers for kernels
CC: gcc-12
@@ -27,6 +33,19 @@ jobs:
- uses: Swatinem/rust-cache@v2
with:
workspaces: rust
- name: Verify and checkout specified tag
if: github.event_name == 'workflow_dispatch'
run: |
git fetch --all --tags
if git rev-parse ${{ github.event.inputs.tag }} >/dev/null 2>&1; then
git checkout ${{ github.event.inputs.tag }}
echo "Successfully checked out tag ${{ github.event.inputs.tag }}"
else
echo "Error: Tag ${{ github.event.inputs.tag }} does not exist"
echo "Available tags:"
git tag -l
exit 1
fi
- name: Install dependencies
run: |
sudo apt update
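The new `workflow_dispatch` trigger allows publishing an existing tag manually. As a hedged sketch (the repository slug, token variable, and tag value are illustrative, not part of this diff), the same trigger can also be fired programmatically with PyGithub, which this repository's CI scripts already use:

```python
import os

from github import Github

# Hypothetical token and repo slug; `inputs` maps to the `tag` input declared
# in the workflow above. The verify step aborts the run early (and lists the
# available tags) if the requested tag does not exist.
repo = Github(os.environ["GITHUB_TOKEN"]).get_repo("lancedb/lance")
workflow = repo.get_workflow("cargo-publish.yml")
workflow.create_dispatch(ref="main", inputs={"tag": "v0.18.1"})
```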
5 changes: 2 additions & 3 deletions .github/workflows/pr-title.yml
@@ -41,12 +41,11 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- run: pip install PyGithub
- env:
PR_NUMBER: ${{ github.event.pull_request.number }}
working-directory: pr
run: |
pip install PyGithub
python ../base/ci/check_versions.py
run: python ../base/ci/check_versions.py
commitlint:
permissions:
pull-requests: write
6 changes: 3 additions & 3 deletions .github/workflows/rust.yml
@@ -99,7 +99,7 @@ jobs:
fail_ci_if_error: false
linux-arm:
runs-on: warp-ubuntu-latest-arm64-4x
timeout-minutes: 30
timeout-minutes: 45
steps:
- uses: actions/checkout@v4
- uses: Swatinem/rust-cache@v2
@@ -160,7 +160,7 @@ jobs:
workspaces: rust
- name: Select new xcode
# Default XCode right now is 15.0.1, which contains a bug that causes
# backtraces to not show properly. See:
# backtraces to not show properly. See:
# https://github.com/rust-lang/rust/issues/113783
run: sudo xcode-select -s /Applications/Xcode_15.4.app
- name: Install dependencies
@@ -171,7 +171,7 @@
rustup component add rustfmt
- name: Run tests
# Check all benches, even though we aren't going to run them.
run: |
run: |
cargo build --tests --benches --all-features --workspace
cargo test --all-features
windows-build:
33 changes: 17 additions & 16 deletions Cargo.toml
@@ -21,7 +21,7 @@ exclude = ["python"]
resolver = "2"

[workspace.package]
version = "0.17.1"
version = "0.18.1"
edition = "2021"
authors = ["Lance Devs <dev@lancedb.com>"]
license = "Apache-2.0"
@@ -44,20 +44,21 @@ categories = [
rust-version = "1.78"

[workspace.dependencies]
lance = { version = "=0.17.1", path = "./rust/lance" }
lance-arrow = { version = "=0.17.1", path = "./rust/lance-arrow" }
lance-core = { version = "=0.17.1", path = "./rust/lance-core" }
lance-datafusion = { version = "=0.17.1", path = "./rust/lance-datafusion" }
lance-datagen = { version = "=0.17.1", path = "./rust/lance-datagen" }
lance-encoding = { version = "=0.17.1", path = "./rust/lance-encoding" }
lance-encoding-datafusion = { version = "=0.17.1", path = "./rust/lance-encoding-datafusion" }
lance-file = { version = "=0.17.1", path = "./rust/lance-file" }
lance-index = { version = "=0.17.1", path = "./rust/lance-index" }
lance-io = { version = "=0.17.1", path = "./rust/lance-io" }
lance-linalg = { version = "=0.17.1", path = "./rust/lance-linalg" }
lance-table = { version = "=0.17.1", path = "./rust/lance-table" }
lance-test-macros = { version = "=0.17.1", path = "./rust/lance-test-macros" }
lance-testing = { version = "=0.17.1", path = "./rust/lance-testing" }
lance = { version = "=0.18.1", path = "./rust/lance" }
lance-arrow = { version = "=0.18.1", path = "./rust/lance-arrow" }
lance-core = { version = "=0.18.1", path = "./rust/lance-core" }
lance-datafusion = { version = "=0.18.1", path = "./rust/lance-datafusion" }
lance-datagen = { version = "=0.18.1", path = "./rust/lance-datagen" }
lance-encoding = { version = "=0.18.1", path = "./rust/lance-encoding" }
lance-encoding-datafusion = { version = "=0.18.1", path = "./rust/lance-encoding-datafusion" }
lance-file = { version = "=0.18.1", path = "./rust/lance-file" }
lance-index = { version = "=0.18.1", path = "./rust/lance-index" }
lance-io = { version = "=0.18.1", path = "./rust/lance-io" }
lance-jni = { version = "=0.18.1", path = "./java/core/lance-jni" }
lance-linalg = { version = "=0.18.1", path = "./rust/lance-linalg" }
lance-table = { version = "=0.18.1", path = "./rust/lance-table" }
lance-test-macros = { version = "=0.18.1", path = "./rust/lance-test-macros" }
lance-testing = { version = "=0.18.1", path = "./rust/lance-testing" }
approx = "0.5.1"
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false, features = ["prettyprint"] }
@@ -110,7 +111,7 @@ datafusion-physical-expr = { version = "40.0", features = [
] }
deepsize = "0.2.0"
either = "1.0"
fsst = { version = "=0.17.1", path = "./rust/lance-encoding/compression-algo/fsst" }
fsst = { version = "=0.18.1", path = "./rust/lance-encoding/compression-algo/fsst" }
futures = "0.3"
http = "0.2.9"
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
12 changes: 10 additions & 2 deletions ci/check_versions.py
@@ -47,8 +47,16 @@ def parse_version(version: str) -> tuple[int, int, int]:
# Check for a breaking-change label in the PRs between the last release and the current commit.
commits = repo.compare(latest_release.tag_name, os.environ["GITHUB_SHA"]).commits
prs = (pr for commit in commits for pr in commit.get_pulls())
pr_labels = (label.name for pr in prs for label in pr.labels)
has_breaking_changes = any(label == "breaking-change" for label in pr_labels)
has_breaking_changes = False
for pr in prs:
pr_labels = (label.name for label in pr.labels)
if any(label == "breaking-change" for label in pr_labels):
has_breaking_changes = True
print(f"Found breaking change in PR #{pr.number}: {pr.title}")
print(f" {pr.html_url}")
break
else:
print("No breaking changes found.")

if os.environ.get("PR_NUMBER"):
# If we're running on a PR, we should validate that the version has been
4 changes: 4 additions & 0 deletions docs/read_and_write.rst
@@ -700,6 +700,10 @@ These options apply to all object stores.
- Description
* - ``allow_http``
- Allow non-TLS, i.e. non-HTTPS connections. Default, ``False``.
* - ``download_retry_count``
- Number of times to retry a download. Default, ``3``. This limit is applied when
the HTTP request succeeds but the response is not fully downloaded, typically due
to a violation of ``request_timeout``.
* - ``allow_invalid_certificates``
- Skip certificate validation on https connections. Default, ``False``.
Warning: This is insecure and should only be used for testing.
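For context, a minimal sketch of passing these options from Python (the dataset URI is a placeholder; `storage_options` takes string values for these keys in the pylance API):

```python
import lance

# Hypothetical URI. download_retry_count caps retries when a request succeeds
# but the response body is cut short, e.g. after a request_timeout violation.
ds = lance.dataset(
    "s3://example-bucket/my_dataset.lance",
    storage_options={
        "allow_http": "false",
        "download_retry_count": "5",
    },
)
```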
6 changes: 3 additions & 3 deletions java/core/lance-jni/Cargo.toml
@@ -14,9 +14,9 @@ crate-type = ["cdylib"]

[dependencies]
lance = { workspace = true, features = ["substrait"] }
lance-encoding = { path = "../../../rust/lance-encoding" }
lance-linalg = { path = "../../../rust/lance-linalg" }
lance-index = { path = "../../../rust/lance-index" }
lance-encoding = { workspace = true }
lance-linalg = { workspace = true }
lance-index = { workspace = true }
lance-io.workspace = true
arrow = { workspace = true, features = ["ffi"] }
arrow-schema.workspace = true
2 changes: 1 addition & 1 deletion java/core/pom.xml
@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lance-parent</artifactId>
<version>0.0.4</version>
<version>0.18.1</version>
<relativePath>../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion java/pom.xml
@@ -6,7 +6,7 @@

<groupId>com.lancedb</groupId>
<artifactId>lance-parent</artifactId>
<version>0.0.4</version>
<version>0.18.1</version>
<packaging>pom</packaging>

<name>Lance Parent</name>
8 changes: 4 additions & 4 deletions java/spark/README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Spark-Lance Connector

The Spark-Lance Connector allows Apache Spark to efficiently read tables stored in Lance format.
The Spark-Lance Connector allows Apache Spark to efficiently read datasets stored in Lance format.
Lance is a modern columnar data format optimized for machine learning workflows and datasets,
supporting distributed, parallel scans, and optimizations such as column and filter pushdown to improve performance.
Additionally, Lance provides high-performance random access that is 100 times faster than Parquet without sacrificing scan performance.
By using the Spark-Lance Connector, you can leverage Spark's powerful data processing, SQL querying, and machine learning training capabilities on the AI data lake powered by Lance.

## Features

* Query Lance Tables: Seamlessly query tables stored in the Lance format using Spark.
* Distributed, Parallel Scans: Leverage Spark's distributed computing capabilities to perform parallel scans on Lance tables.
* Query Lance Datasets: Seamlessly query datasets stored in the Lance format using Spark.
* Distributed, Parallel Scans: Leverage Spark's distributed computing capabilities to perform parallel scans on Lance datasets.
* Column and Filter Pushdown: Optimize query performance by pushing down column selections and filters to the data source.

## Installation
@@ -49,7 +49,7 @@ SparkSession spark = SparkSession.builder()

Dataset<Row> data = spark.read().format("lance")
.option("db", "/path/to/example_db")
.option("table", "lance_example_table")
.option("dataset", "lance_example_dataset")
.load();

data.show(100)
4 changes: 2 additions & 2 deletions java/spark/pom.xml
@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lance-parent</artifactId>
<version>0.0.4</version>
<version>0.18.1</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -40,7 +40,7 @@
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lance-core</artifactId>
<version>0.0.4</version>
<version>0.18.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
85 changes: 85 additions & 0 deletions java/spark/src/main/java/com/lancedb/lance/spark/LanceCatalog.java
@@ -0,0 +1,85 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.lancedb.lance.spark;

import com.lancedb.lance.spark.internal.LanceDatasetAdapter;
import com.lancedb.lance.spark.utils.Optional;
import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.Table;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.catalog.TableChange;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
import scala.Some;

import java.util.Map;

public class LanceCatalog implements TableCatalog {
@Override
public Identifier[] listTables(String[] namespace) throws NoSuchNamespaceException {
throw new UnsupportedOperationException("Please use lancedb catalog for dataset listing");
}

@Override
public Table loadTable(Identifier ident) throws NoSuchTableException {
LanceConfig config = LanceConfig.from(ident.name());
Optional<StructType> schema = LanceDatasetAdapter.getSchema(ident.name());
if (schema.isEmpty()) {
throw new NoSuchTableException(config.getDbPath(), config.getDatasetName());
}
return new LanceDataset(LanceConfig.from(ident.name()), schema.get());
}

@Override
public Table createTable(Identifier ident, StructType schema, Transform[] partitions,
Map<String, String> properties) throws TableAlreadyExistsException, NoSuchNamespaceException {
try {
LanceDatasetAdapter.createDataset(ident.name(), schema);
} catch (IllegalArgumentException e) {
throw new TableAlreadyExistsException(ident.name(), new Some<>(e));
}
return new LanceDataset(LanceConfig.from(properties, ident.name()), schema);
}

@Override
public Table alterTable(Identifier ident, TableChange... changes) throws NoSuchTableException {
throw new UnsupportedOperationException();
}

@Override
public boolean dropTable(Identifier ident) {
throw new UnsupportedOperationException();
}

@Override
public void renameTable(Identifier oldIdent, Identifier newIdent)
throws NoSuchTableException, TableAlreadyExistsException {
throw new UnsupportedOperationException();
}

@Override
public void initialize(String name, CaseInsensitiveStringMap options) {
// Do nothing here
}

@Override
public String name() {
return "lance";
}
}
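A hedged usage sketch for the new catalog: the `spark.sql.catalog.<name>` config key is Spark's standard catalog-plugin mechanism, but the identifier format below is an assumption inferred from `LanceConfig.from(ident.name())` above, not a confirmed API:

```python
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("lance-catalog-example")
    # Register LanceCatalog under the catalog name "lance".
    .config("spark.sql.catalog.lance", "com.lancedb.lance.spark.LanceCatalog")
    .getOrCreate()
)

# loadTable() resolves the identifier via LanceConfig, so the name below is a
# hypothetical dataset path, not a confirmed identifier format.
df = spark.table("lance.`/path/to/example_db/lance_example_dataset.lance`")
df.show()
```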