diff --git a/airflow/olake_sync_from_source.py b/airflow/olake_sync_from_source.py index 3db5eb4e..14e99bf3 100644 --- a/airflow/olake_sync_from_source.py +++ b/airflow/olake_sync_from_source.py @@ -14,11 +14,11 @@ # This connection tells Airflow how to authenticate with your K8s cluster. KUBERNETES_CONN_ID = "kubernetes_default" # <-- EDIT THIS LINE -# !!! IMPORTANT: Set this to the Kubernetes namespace where Olake pods should run !!! +# !!! IMPORTANT: Set this to the Kubernetes namespace where OLake pods should run !!! # Ensure ConfigMaps and the PVC exist or will be created in this namespace. TARGET_NAMESPACE = "olake" # <-- EDIT THIS LINE -# !!! IMPORTANT: Set this to the correct Olake image for your source database !!! +# !!! IMPORTANT: Set this to the correct OLake image for your source database !!! # Find images at: https://hub.docker.com/u/olakego # Examples: "olakego/source-mongodb:latest", "olakego/source-mysql:latest", "olakego/source-postgres:latest" OLAKE_IMAGE = "olakego/source-db:latest" # <-- EDIT THIS LINE @@ -54,9 +54,9 @@ # Generic tags tags=["kubernetes", "olake", "etl", "sync"], doc_md=""" - ### Olake Sync DAG + ### OLake Sync DAG - This DAG runs the Olake `sync` command using pre-created ConfigMaps + This DAG runs the OLake `sync` command using pre-created ConfigMaps for source, destination, and streams configuration. It ensures a persistent volume claim exists before running the sync task. @@ -249,7 +249,7 @@ def create_pvc_with_hook(**context): ), ], - # Use the container's default entrypoint (should be the Olake binary) + # Use the container's default entrypoint (should be the OLake binary) cmds=None, # Pass arguments for the 'sync' command arguments=[ diff --git a/airflow/olake_sync_from_source_ec2.py b/airflow/olake_sync_from_source_ec2.py index 8142dff2..a3b2f4b9 100644 --- a/airflow/olake_sync_from_source_ec2.py +++ b/airflow/olake_sync_from_source_ec2.py @@ -237,7 +237,7 @@ def run_olake_docker_via_ssh(ti, ssh_conn_id, command): fi echo "INFO: State file uploaded successfully." -# Now check the Olake exit code +# Now check the OLake exit code if [ $OLAKE_EXIT_CODE -ne 0 ]; then echo "ERROR: ETL job failed with exit code $OLAKE_EXIT_CODE." exit $OLAKE_EXIT_CODE diff --git a/blog/2025-01-07-olake-architecture.mdx b/blog/2025-01-07-olake-architecture.mdx index d0acd3df..3f7c11e8 100644 --- a/blog/2025-01-07-olake-architecture.mdx +++ b/blog/2025-01-07-olake-architecture.mdx @@ -17,7 +17,7 @@ update: [18.02.2025] When building [OLake](https://olake.io/), our goal was simple: *Fastest DB to Data LakeHouse (Apache Iceberg to start) data pipeline.* -Checkout GtiHub repository for OLake - [https://github.com/datazip-inc/olake](https://github.com/datazip-inc/olake) +Checkout GitHub repository for OLake - [https://github.com/datazip-inc/olake](https://github.com/datazip-inc/olake) Over time, many of us whoโ€™ve worked with data pipelines have dealt with the toil of building one-off ETL scripts, battling performance bottlenecks, or worrying about vendor lock-in. diff --git a/blog/2025-04-23-how-to-set-up-postgresql-cdc-on-aws-rds.mdx b/blog/2025-04-23-how-to-set-up-postgresql-cdc-on-aws-rds.mdx index 908821cf..209faf6a 100644 --- a/blog/2025-04-23-how-to-set-up-postgresql-cdc-on-aws-rds.mdx +++ b/blog/2025-04-23-how-to-set-up-postgresql-cdc-on-aws-rds.mdx @@ -251,7 +251,7 @@ Essential monitoring includes: ### **Does OLake take care of Full-historical snapshot/replication before CDC? 
How fast is it?** -OLake has fastest optimised historical load: +OLake has fastest optimized historical load: - OLake has Historical-load + CDC mode for this - Tables are chunked into smaller pieces to make it parallel and recoverable from failures - Any new table additions is also taken care of automatically. diff --git a/blog/2025-05-08-olake-airflow-on-ec2.mdx b/blog/2025-05-08-olake-airflow-on-ec2.mdx index 9e1b8ac8..b119d126 100644 --- a/blog/2025-05-08-olake-airflow-on-ec2.mdx +++ b/blog/2025-05-08-olake-airflow-on-ec2.mdx @@ -14,7 +14,7 @@ tags: [olake] At OLake, we're building tools to make data integration seamless. Today, we're excited to show you how to leverage your existing Apache Airflow setup to automate OLake data synchronization tasks directly on your EC2 Server! -Olake is designed to efficiently sync data from various sources to your chosen destinations. This guide provides an Airflow DAG (Directed Acyclic Graph) that orchestrates the Olake sync command by provisioning a dedicated EC2 instance, executing Olake within a Docker container and handling configuration and state persistence through Amazon S3. +OLake is designed to efficiently sync data from various sources to your chosen destinations. This guide provides an Airflow DAG (Directed Acyclic Graph) that orchestrates the OLake sync command by provisioning a dedicated EC2 instance, executing OLake within a Docker container and handling configuration and state persistence through Amazon S3. This post assumes you already have: @@ -46,7 +46,7 @@ Before deploying the DAG, ensure the following are in place: * Click on the + icon to `Add a new record ` * Select the `Connection Type` to be `Amazon Web Services ` * Enter a `Connection Id` (this would be later used in `AWS_CONNECTION_ID` variable in the DAG) - * **(Important)** Either enter `AWS Access Key Id` and `AWS Secret Access Key` or user can just attach an AWS IAM Role to the Airflow instance (with sufficient permissions as below code snippet). If no Access Keys are used, default boto3 behaviour is used. + * **(Important)** Either enter `AWS Access Key Id` and `AWS Secret Access Key` or user can just attach an AWS IAM Role to the Airflow instance (with sufficient permissions as below code snippet). If no Access Keys are used, default boto3 behavior is used. * Click **Save**. @@ -132,7 +132,7 @@ Before deploying the DAG, ensure the following are in place: ``` -* **SSH Connection (`SSH_CONNECTION_ID` in the DAG):** This connection allows Airflow to securely connect to the dynamically created EC2 instance to execute the Olake setup and run commands. +* **SSH Connection (`SSH_CONNECTION_ID` in the DAG):** This connection allows Airflow to securely connect to the dynamically created EC2 instance to execute the OLake setup and run commands. * Still in the Airflow UI (`Admin` -> `Connections`), click the `+` icon to add another new record. * Set the **Connection Type** to **SSH**. * Enter a **Connection Id** (e.g., `ssh_ec2_olake`). This exact ID will be used for the `SSH_CONNECTION_ID` variable in your DAG. @@ -172,11 +172,11 @@ Before deploying the DAG, ensure the following are in place: -#### 3. **Amazon S3 Setup for Olake Configurations and State:** -* **S3 Bucket (`S3_BUCKET_NAME` in the DAG):** Create an S3 bucket where Olake's configuration files and persistent state file will be stored. 
-* **S3 Prefix for Configurations (`S3_PREFIX` in the DAG):** Decide on a "folder" (S3 prefix) within your bucket where your Olake configuration files will reside (e.g., `olake/projectA/configs/`). +#### 3. **Amazon S3 Setup for OLake Configurations and State:** +* **S3 Bucket (`S3_BUCKET_NAME` in the DAG):** Create an S3 bucket where OLake's configuration files and persistent state file will be stored. +* **S3 Prefix for Configurations (`S3_PREFIX` in the DAG):** Decide on a "folder" (S3 prefix) within your bucket where your OLake configuration files will reside (e.g., `olake/projectA/configs/`). -* **Upload Olake Configuration Files:** Before running the DAG, you must upload your Olake `source.json`, `streams.json`, and `destination.json` files to the S3 bucket under the prefix you defined. The DAG's SSH script will sync these files to the EC2 instance. Please visit[ OLake Docs](https://olake.io/docs) website to learn how the[ source](https://olake.io/docs/connectors/overview) and[ destinations](https://olake.io/docs/writers/overview) can be set up. +* **Upload OLake Configuration Files:** Before running the DAG, you must upload your OLake `source.json`, `streams.json`, and `destination.json` files to the S3 bucket under the prefix you defined. The DAG's SSH script will sync these files to the EC2 instance. Please visit[ OLake Docs](https://olake.io/docs) website to learn how the[ source](https://olake.io/docs/connectors/overview) and[ destinations](https://olake.io/docs/writers/overview) can be set up. We need to generate `streams.json` beforehand using the OLake `discover` command against your source database. * Streams Generation Guides: @@ -185,9 +185,9 @@ We need to generate `streams.json` beforehand using the OLake `discover` command * The content of this file will be placed within the `streams.json` file. #### 4. **EC2 Instance IAM Role (`IAM_ROLE_NAME` in the DAG):** -The EC2 instances launched by Airflow (which will act as the worker nodes for Olake) need their own set of permissions to perform their tasks. This is achieved by assigning them an IAM Instance Profile. This instance profile must have an attached IAM policy granting permissions to: -* Access Amazon S3 to download Olake configuration files. -* Access Amazon S3 to read and write the Olake state file. +The EC2 instances launched by Airflow (which will act as the worker nodes for OLake) need their own set of permissions to perform their tasks. This is achieved by assigning them an IAM Instance Profile. This instance profile must have an attached IAM policy granting permissions to: +* Access Amazon S3 to download OLake configuration files. +* Access Amazon S3 to read and write the OLake state file. ```json # s3_access_policy.json @@ -290,7 +290,7 @@ OLAKE_IMAGE = "DOCKER_IMAGE_NAME" ## Recap of Values to Change: -To ensure the DAG runs correctly in your environment, you **must** update the following placeholder variables in the `olake_sync_from_source_ec2.py` (or your DAG file name) with your specific AWS and Olake details: +To ensure the DAG runs correctly in your environment, you **must** update the following placeholder variables in the `olake_sync_from_source_ec2.py` (or your DAG file name) with your specific AWS and OLake details: @@ -303,7 +303,7 @@ To ensure the DAG runs correctly in your environment, you **must** update the fo ### **EC2 Instance Configuration:** * `AMI_ID`: Replace with the actual AMI ID of a container-ready image (with Docker/containerd, aws-cli, jq) in your chosen `AWS_REGION_NAME`. 
-* `INSTANCE_TYPE`: (Optional) Select an appropriate EC2 instance type based on your Olake workload's resource needs (e.g., `t3.medium`, `m5.large`, or an ARM equivalent like `t4g.medium`). \ +* `INSTANCE_TYPE`: (Optional) Select an appropriate EC2 instance type based on your OLake workload's resource needs (e.g., `t3.medium`, `m5.large`, or an ARM equivalent like `t4g.medium`). \ The AMI tag we have hardcoded is EKS supported Ubuntu image with containerd and aws-cli pre-installed which are very crucial for the DAG to work. Another point to note is that since Graviton powered machines are cheaper compared to x86 machines, so the AMI already uses ARM architecture AMI. * `KEY_NAME`: Enter the name of the EC2 Key Pair you want to associate with the launched instances. This is the same key we have used while setting up the SSH Connection. * `SUBNET_ID`: Provide the ID of the VPC subnet where the EC2 instance should be launched. @@ -311,11 +311,11 @@ The AMI tag we have hardcoded is EKS supported Ubuntu image with containerd and * `IAM_ROLE_NAME`: Enter the **name** (not the ARN) of the IAM Instance Profile that grants the EC2 instance necessary permissions (primarily S3 access). * `DEFAULT_EC2_USER`: Change this if the default SSH username for your chosen `AMI_ID` is different from `ubuntu` (e.g., `ec2-user` for Amazon Linux). -### **ETL Configuration (S3 & Olake):** +### **ETL Configuration (S3 & OLake):** -* `S3_BUCKET_NAME`: The name of your S3 bucket where Olake configurations and state will be stored. -* `S3_BUCKET_PREFIX`: The "folder" path (prefix) within your S3 bucket for Olake files (e.g., `olake/projectA/configs/`). Remember the trailing slash if it's part of your intended structure. -* `OLAKE_IMAGE`: The full name of the Olake Docker image you want to use (e.g., `olakego/source-postgres:latest`, `olakego/source-mysql:latest`, `olakego/source-mongodb:latest`). +* `S3_BUCKET_NAME`: The name of your S3 bucket where OLake configurations and state will be stored. +* `S3_BUCKET_PREFIX`: The "folder" path (prefix) within your S3 bucket for OLake files (e.g., `olake/projectA/configs/`). Remember the trailing slash if it's part of your intended structure. +* `OLAKE_IMAGE`: The full name of the OLake Docker image you want to use (e.g., `olakego/source-postgres:latest`, `olakego/source-mysql:latest`, `olakego/source-mongodb:latest`). ### Deploying the DAG to Airflow @@ -325,7 +325,7 @@ The AMI tag we have hardcoded is EKS supported Ubuntu image with containerd and 2. Place the file into the `dags` folder recognized by your Airflow instance. The location of this folder depends on your Airflow setup. 3. Airflow automatically scans this folder. Wait a minute or two, and the DAG named `olake_sync_from_source` should appear in the Airflow UI. You might need to unpause it (toggle button on the left) if it loads in a paused state. -### Running Your Dynamic Olake Sync on EC2 +### Running Your Dynamic OLake Sync on EC2 1. **Access Airflow UI:** Navigate to your Airflow web UI. 2. **Find and Unpause DAG:** Locate the DAG, likely named `olake_sync_from_source` (or whatever `dag_id` you've set). If it's paused, click the toggle to unpause it. @@ -333,15 +333,15 @@ The AMI tag we have hardcoded is EKS supported Ubuntu image with containerd and 4. **Monitor the Run:** Click on the DAG run instance to view its progress in the Graph, Gantt, or Tree view. 
You will see the following sequence of tasks: * `create_ec2_instance_task`: This task will begin first, using the AWS connection to launch a new EC2 instance according to your DAG's configuration (AMI, instance type, networking, IAM role). Airflow will wait for this instance to be in a 'running' state. * `get_instance_ip_task`: Once the instance is running, this Python task will execute. It queries AWS to get the IP address or DNS name of the new EC2 instance, making it available for the next task. It also includes a pause to allow the SSH service on the new instance to become fully available. - * `run_olake_docker_task`: This is the core task where Olake runs. It will: + * `run_olake_docker_task`: This is the core task where OLake runs. It will: * Connect to the newly created EC2 instance via SSH using the configured SSH connection. * Execute the shell commands defined in `olake_ssh_command` within your DAG. This script prepares the EC2 instance by: * Creating necessary directories. - * Downloading your Olake configuration files and the latest state file from S3. - * Pulling the specified Olake Docker image using `ctr image pull`. - * Running the Olake `sync` process inside a Docker container using `ctr run ... /home/olake sync ...`. + * Downloading your OLake configuration files and the latest state file from S3. + * Pulling the specified OLake Docker image using `ctr image pull`. + * Running the OLake `sync` process inside a Docker container using `ctr run ... /home/olake sync ...`. * Uploading the updated state file back to S3 upon successful completion. - * You can click on this task instance in the Airflow UI and view its logs. These logs will contain the **real-time STDOUT and STDERR** from the SSH session on the EC2 instance, including the output from the Olake Docker container. This is where you'll see Olake's synchronization progress and any potential errors from the Olake process itself. + * You can click on this task instance in the Airflow UI and view its logs. These logs will contain the **real-time STDOUT and STDERR** from the SSH session on the EC2 instance, including the output from the OLake Docker container. This is where you'll see OLake's synchronization progress and any potential errors from the OLake process itself. * `terminate_ec2_instance_task`: After the `run_olake_docker_task` completes (whether it succeeds or fails, due to `trigger_rule=TriggerRule.ALL_DONE`), this final task will execute. It securely terminates the EC2 instance that was launched for this DAG run, ensuring you don't incur unnecessary AWS charges. ![olake-airflow-on-ec2-3](/img/blog/2025/05/olake-airflow-on-ec2-3.webp) diff --git a/blog/2025-08-12-building-open-data-lakehouse-from-scratch.mdx b/blog/2025-08-12-building-open-data-lakehouse-from-scratch.mdx index f03eb665..af1bdc47 100644 --- a/blog/2025-08-12-building-open-data-lakehouse-from-scratch.mdx +++ b/blog/2025-08-12-building-open-data-lakehouse-from-scratch.mdx @@ -39,7 +39,7 @@ Here's where things get really interesting. Unlike traditional ETL pipelines tha ## Step 1: Setting Up OLake - CDC Engine -Olake has one of its unique offerings the OLake UI, which we will be using for our setup. This is a user-friendly control center for managing data pipelines without relying heavily on CLI commands. It allows you to configure sources, destinations, and jobs visually, making the setup more accessible and less error-prone. 
Many organizations actively use OLake UI to reduce manual CLI work, streamline CDC pipelines, and adopt a no-code-friendly approach. +OLake has one of its unique offerings the OLake UI, which we will be using for our setup. This is a user-friendly control center for managing data pipelines without relying heavily on CLI commands. It allows you to configure sources, destinations, and jobs visually, making the setup more accessible and less error-prone. Many organizations actively use OLake UI to reduce manual CLI work, streamline CDC pipelines, and adopt a no-code-friendly approach. For our setup, we will be working with the OLake UI. We'll start by cloning the repository from GitHub and bringing it up using Docker Compose. Once the UI is running, it will serve as our control hub for creating and monitoring all CDC pipelines. @@ -79,7 +79,7 @@ Once it's running, go ahead at http://localhost:8000, olake-ui and use these cre ![olake-login](/img/blog/2025/10/olake-login.webp) -**You are greeted with Olake UI!** +**You are greeted with OLake UI!** ![olake-ui](/img/blog/2025/10/olakeui.webp) diff --git a/blog/2025-08-29-deploying-olake-on-kubernetes.mdx b/blog/2025-08-29-deploying-olake-on-kubernetes.mdx index 1835ad9a..3ad755ed 100644 --- a/blog/2025-08-29-deploying-olake-on-kubernetes.mdx +++ b/blog/2025-08-29-deploying-olake-on-kubernetes.mdx @@ -162,7 +162,7 @@ global: olake.io/workload-type: "memory-optimized" 456: olake.io/workload-type: "general-purpose" - # Default scheduling behaviour + # Default scheduling behavior 789: {} ``` @@ -172,7 +172,7 @@ A typical enterprise scenario can be considered: a massive customer transactions Without node mapping, both operations might be scheduled on the same node by Kubernetes, causing memory contention. Or worse, the memory-hungry sync job might be put on a small node where an out-of-memory error would cause it to fail. -With JobID-based mapping, the heavy sync is necessarily landed on a node with label `olake.io/workload-type: "memory-optimized"` where completion is achieved in 30 minutes instead of timing out. The other sync job are run happily on smaller, cheaper nodes, finishing without waste. +With JobID-based mapping, the heavy sync is necessarily landed on a node with label `olake.io/workload-type: "memory-optimized"` where completion is achieved in 30 minutes instead of timing out. The other sync jobs are run happily on smaller, cheaper nodes, finishing without waste. ### The Progressive Advantage diff --git a/blog/authors.yml b/blog/authors.yml index d17a5932..ac89773b 100644 --- a/blog/authors.yml +++ b/blog/authors.yml @@ -85,7 +85,7 @@ akshay: duke: page: true name: Duke - title: Olake Maintainer + title: OLake Maintainer image_url: /img/authors/duke.webp socials: linkedin: dukedhal diff --git a/docs/connectors/mongodb/cdc_setup.mdx b/docs/connectors/mongodb/cdc_setup.mdx index 53831a02..b65ab328 100644 --- a/docs/connectors/mongodb/cdc_setup.mdx +++ b/docs/connectors/mongodb/cdc_setup.mdx @@ -51,7 +51,7 @@ This guide covers setting up Change Data Capture (CDC) for both self-hosted Mong **Applicable for both MongoDB (Self-Hosted) and Atlas.** -Olake needs a user that can: +OLake needs a user that can: - Read/write your application database (to ingest data). - Read from the local database (where oplog is stored). 
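For reference, a minimal sketch of creating such a MongoDB user with `pymongo` is shown below. The connection string, application database name (`appdb`), username, and password are all placeholders, and the exact role set should be confirmed against the OLake MongoDB CDC docs; on Atlas, database users are created through the Atlas UI or Admin API rather than a direct `createUser` command.

```python
# Sketch only: placeholders throughout. Assumes a self-hosted MongoDB reachable
# on localhost:27017 and an existing admin account; "appdb" stands in for your
# application database.
from pymongo import MongoClient

client = MongoClient("mongodb://admin:admin_password@localhost:27017/?authSource=admin")

# Create a user OLake can authenticate as: read/write on the application
# database (to ingest data) and read on "local" (where the oplog is stored).
client["admin"].command(
    "createUser",
    "olake_user",          # placeholder username
    pwd="change-me",       # placeholder password
    roles=[
        {"role": "readWrite", "db": "appdb"},  # application database
        {"role": "read", "db": "local"},       # oplog access for CDC
    ],
)
```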
diff --git a/docs/connectors/mongodb/index.mdx b/docs/connectors/mongodb/index.mdx index 0599f2a6..12ac232b 100644 --- a/docs/connectors/mongodb/index.mdx +++ b/docs/connectors/mongodb/index.mdx @@ -50,12 +50,12 @@ For local setup, follow **[MongoDB via Docker Compose](/docs/connectors/mongodb/ - + ### 1. Navigate to the Source Configuration Page 1. Complete the [OLake UI Setup Guide](/docs/getting-started/olake-ui) -2. After logging in to the OlakeUI, select the `Sources` tab from the left sidebar. +2. After logging in to the OLake UI, select the `Sources` tab from the left sidebar. 3. Click **`Create Source`** on the top right corner. 4. Select **MongoDB** from the connector dropdown 5. Provide a name for this source. @@ -63,7 +63,7 @@ For local setup, follow **[MongoDB via Docker Compose](/docs/connectors/mongodb/ ### 2. Provide Configuration Details - Enter MongoDB credentials. -![Olake UI MongoDB Source Setup](/img/connectors/mongodb/mongodb-ui-setup.webp) +![OLake UI MongoDB Source Setup](/img/connectors/mongodb/mongodb-ui-setup.webp) | Field | Description | Example Value | | ----------------------------- | -------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- | @@ -86,11 +86,11 @@ For local setup, follow **[MongoDB via Docker Compose](/docs/connectors/mongodb/ - + ### 1. Create Configuration File - - Once the Olake CLI is setup, create a folder to store configuration files such as `source.json` and `destination.json`. + - Once the OLake CLI is setup, create a folder to store configuration files such as `source.json` and `destination.json`. ### 2. Provide Configuration Details diff --git a/docs/connectors/mysql/index.mdx b/docs/connectors/mysql/index.mdx index f565e430..8e248063 100644 --- a/docs/connectors/mysql/index.mdx +++ b/docs/connectors/mysql/index.mdx @@ -48,7 +48,7 @@ The OLake MySQL Source connector supports multiple sync modes. It also offers fe - + ### 1. Navigate to the Source Configuration Page diff --git a/docs/connectors/oracle/index.mdx b/docs/connectors/oracle/index.mdx index bd6b32ae..15b22220 100644 --- a/docs/connectors/oracle/index.mdx +++ b/docs/connectors/oracle/index.mdx @@ -28,12 +28,12 @@ The OLake Oracle Source connector supports two synchronization modes. It offers - + ### 1. Navigate to the Source Configuration Page 1. Complete the [OLake UI Setup Guide](/docs/getting-started/olake-ui) -2. After logging in to the OlakeUI, select the `Sources` tab from the left sidebar. +2. After logging in to the OLake UI, select the `Sources` tab from the left sidebar. 3. Click **`Create Source`** on the top right corner. 4. Select **Oracle** from the connector dropdown 5. Provide a name for this source. @@ -75,7 +75,7 @@ Oracle Database automatically converts unquoted lowercase usernames to uppercase ### 1. Create Configuration File - - Once the Olake CLI is setup, create a folder to store configuration files such as `source.json` and `destination.json`. + - Once the OLake CLI is setup, create a folder to store configuration files such as `source.json` and `destination.json`. The `source.json` file for oracle must contain these mandatory fields. diff --git a/docs/connectors/postgres/index.mdx b/docs/connectors/postgres/index.mdx index 57edab3d..79af08ea 100644 --- a/docs/connectors/postgres/index.mdx +++ b/docs/connectors/postgres/index.mdx @@ -38,12 +38,12 @@ The OLake Postgres Source connector supports multiple synchronization modes. It - + ### 1. 
Navigate to the Source Configuration Page 1. Complete the [OLake UI Setup Guide](/docs/getting-started/olake-ui) -2. After logging in to the OlakeUI, select the `Sources` tab from the left sidebar. +2. After logging in to the OLake UI, select the `Sources` tab from the left sidebar. 3. Click **`Create Source`** on the top right corner. 4. Select **Postgres** from the connector dropdown 5. Provide a name for this source. @@ -82,7 +82,7 @@ The OLake Postgres Source connector supports multiple synchronization modes. It ### 1. Create Configuration File - - Once the Olake CLI is setup, create a folder to store configuration files such as `source.json` and `destination.json`. + - Once the OLake CLI is setup, create a folder to store configuration files such as `source.json` and `destination.json`. The `source.json` file for postgres must contain these mandatory fields. diff --git a/docs/getting-started/playground.mdx b/docs/getting-started/playground.mdx index 9c111811..8704f0a9 100644 --- a/docs/getting-started/playground.mdx +++ b/docs/getting-started/playground.mdx @@ -96,7 +96,7 @@ This will spin up: Once the stack is up and running (especially after init-mysql-tasks and olake-app are healthy/started): -- Olake Application UI: http://localhost:8000 +- OLake Application UI: http://localhost:8000 Default credentials: @@ -118,9 +118,9 @@ Once the stack is up and running (especially after init-mysql-tasks and olake-ap SELECT * FROM weather LIMIT 10; ``` This will display the first 10 rows of the weather table. -### 5. Interacting with Olake +### 5. Interacting with OLake - 1. Log in to the Olake UI at http://localhost:8000 using the default credentials. + 1. Log in to the OLake UI at http://localhost:8000 using the default credentials. 2. Create and Configure a Job: Create a Job to define and run the data pipeline: On the main page, click on the "Create your first Job" button @@ -162,7 +162,7 @@ Once the stack is up and running (especially after init-mysql-tasks and olake-ap * **Iceberg Database (example)**: weather - * **S3 Endpoint (for Iceberg data files written by Olake workers)**: http://host.docker.internal:9090 + * **S3 Endpoint (for Iceberg data files written by OLake workers)**: http://host.docker.internal:9090 * **AWS Region**: us-east-1 diff --git a/docs/install/docker-cli.mdx b/docs/install/docker-cli.mdx index 5fe0224a..5411be95 100644 --- a/docs/install/docker-cli.mdx +++ b/docs/install/docker-cli.mdx @@ -233,7 +233,7 @@ The `stats.json` file remains available after sync completion. ## Logs -During a sync, Olake automatically creates an olake.log file inside a folder named `sync_` (for example: sync_2025-02-17_11-41-40). +During a sync, OLake automatically creates an olake.log file inside a folder named `sync_` (for example: sync_2025-02-17_11-41-40). This folder is created in the same directory as the configuration files. - The olake.log file contains a complete record of all logs generated while the command is running. diff --git a/docs/release/v0.1.0-v0.1.1.mdx b/docs/release/v0.1.0-v0.1.1.mdx index 06319d27..95f0ba4c 100644 --- a/docs/release/v0.1.0-v0.1.1.mdx +++ b/docs/release/v0.1.0-v0.1.1.mdx @@ -1,4 +1,4 @@ -# Olake (v0.1.0 โ€“ v0.1.1) +# OLake (v0.1.0 โ€“ v0.1.1) June 13 โ€“ June 18, 2025 ## ๐ŸŽฏ What's New @@ -17,7 +17,7 @@ June 13 โ€“ June 18, 2025 1. **Driver Releaser -**
Launches the OLake Driver Releaser tool for packaging and distributing OLake connectors, making driver updates seamless across environments. -2. **Strict CDC Sync Mode -**
Adds a new mode that applies only change events and skips any fullโ€‘refresh backfill during syncs, guaranteeing CDCโ€‘only behaviour. This reduces load on sources/targets and avoids accidental reโ€‘snapshots in continuous pipelines. +2. **Strict CDC Sync Mode -**
Adds a new mode that applies only change events and skips any fullโ€‘refresh backfill during syncs, guaranteeing CDCโ€‘only behavior. This reduces load on sources/targets and avoids accidental reโ€‘snapshots in continuous pipelines. 3. **Discover with Merge -**
Schema discovery now merges results into an existing streams.json so prior selections and settings are preserved while new streams are added. This minimizes manual edits when onboarding new tables or evolving schemas. diff --git a/docs/release/v0.1.6-v0.1.8.mdx b/docs/release/v0.1.6-v0.1.8.mdx index 694896fa..f2059bf2 100644 --- a/docs/release/v0.1.6-v0.1.8.mdx +++ b/docs/release/v0.1.6-v0.1.8.mdx @@ -7,7 +7,7 @@ July 17 โ€“ July 30, 2025 1. **Incremental Sync: MongoDB and Oracle -**
Added incremental synchronisation support for MongoDB and Oracle sources. This adds changeโ€‘only replication for both the sources so OLake transfers new/updated documents since the last run, reducing latency and data volume for recurring pipelines. -2. **Oracle Connector Filter & Chunking -**
Added filter support and optimised chunking strategy for the Oracle connector. This ensures query-level filtering and an optimized chunking strategy to the Oracle connector, ensuring only relevant rows are fetched and evenly sized data chunks maximize parallel throughput. +2. **Oracle Connector Filter & Chunking -**
Added filter support and an optimized chunking strategy to the Oracle connector, applying query-level filtering so that only relevant rows are fetched and evenly sized data chunks maximize parallel throughput.

3. **Oracle multi cursor support for incremental Sync -**
OLake incremental sync can now be configured with a primary and secondary cursor column of the same datatype, where the secondary is used only if the primary cursor value is NULL, reducing missed changes in sparse or nullโ€‘heavy tables. @@ -29,4 +29,4 @@ July 17 โ€“ July 30, 2025 2. **Discovery Cursor Fix -**
Fixes merging of cursor fields in the new discover flow so schema and cursor metadata are recorded consistently. This avoids missing or duplicated cursor information when building stream definitions. -3. **Postgres CDC Reliability -**
Improved Postgres CDC behaviour by advancing LSN during full load when cache is invalid. \ No newline at end of file +3. **Postgres CDC Reliability -**
Improved Postgres CDC behavior by advancing LSN during full load when cache is invalid. \ No newline at end of file diff --git a/docs/understanding/terminologies/olake.mdx b/docs/understanding/terminologies/olake.mdx index 28066ac0..0195e675 100644 --- a/docs/understanding/terminologies/olake.mdx +++ b/docs/understanding/terminologies/olake.mdx @@ -115,7 +115,7 @@ Normalization must be enabled at the schema configuration step per table or stre :::
Olake Partition
@@ -125,14 +125,14 @@ Normalization must be enabled at the schema configuration step per table or stre
Olake Partition output
*After state: Sync done, when normalization is on.*
Olake Partition output
@@ -145,7 +145,7 @@ Normalization must be enabled at the schema configuration step per table or stre ### 2. Sync Modes -Sync modes in Olake define the strategy used to replicate data from a source system to a destination. Each mode represents a different approach to data synchronization, with specific behaviours, guarantees, and performance characteristics. +Sync modes in OLake define the strategy used to replicate data from a source system to a destination. Each mode represents a different approach to data synchronization, with specific behaviors, guarantees, and performance characteristics. OLake supports 4 distinct sync modes: @@ -159,7 +159,7 @@ OLake supports 4 distinct sync modes: A delta-sync strategy that only processes new or changed records since the last sync. Requires primary (mandatory) and secondary cursor (optional) fields for change detection. Similar to CDC sync, an initial full-refresh takes place in this as well. :::info Cursor fields are columns in the source table used to track the last synced records. \ - Olake allows setting up to two cursor fields: + OLake allows setting up to two cursor fields: - **Primary Cursor:** This is the mandatory cursor field through which the changes in the records are captured and compared. - **Secondary Cursor:** In case primary cursor's value is null, then the value of secondary cursor is considered if provided. ::: @@ -169,7 +169,7 @@ OLake supports 4 distinct sync modes:
Olake Partition output
@@ -192,7 +192,7 @@ The data filter feature allows selective ingestion from source databases by appl
Olake Partition output
@@ -208,7 +208,7 @@ For more information, refer to [Schema Evolution Feature](/docs/features?tab=sch
Olake Partition output
@@ -229,14 +229,14 @@ Unlike traditional systems like Hive, Iceberg's approach uses "hidden partitioni *For input as given below,*
Olake Partition
-*As seen below, partition on created_at field, tranformation using 'day' has been done.* +*As seen below, partition on created_at field, transformation using 'day' has been done.*
Olake Partition output
@@ -253,7 +253,7 @@ Unlike traditional systems like Hive, Iceberg's approach uses "hidden partitioni ### 6. Job Configuration -The job configuration property refers to the options that defines jobโ€™s name, schedule, and execution in the Olakeโ€™s system. \ +The job configuration property refers to the options that defines jobโ€™s name, schedule, and execution in the OLakeโ€™s system. \ User has to start with job creation, which will be followed with source configuration, then destination configuration, checking and enabling relevant streams from the schema for sync, and then finally in job configuration, job name and frequency has to be set. - **Frequency Options:** @@ -276,7 +276,7 @@ User has to start with job creation, which will be followed with source configur
Olake Partition output
diff --git a/docs/writers/iceberg/azure.mdx b/docs/writers/iceberg/azure.mdx index 33ee04d2..56828880 100644 --- a/docs/writers/iceberg/azure.mdx +++ b/docs/writers/iceberg/azure.mdx @@ -35,7 +35,7 @@ To follow this guide, you will need: First, we need to set up the storage foundation and credentials on Azure. All these steps are performed in the Azure Portal. 1. **Create Service Principal:** Create a new **App Registration**. This will be our application's identity. - **Name**: `Olake Warehouse (Development)` + **Name**: `OLake Warehouse (Development)` **Redirect URI**: Leave empty a. When the App Registration is created, select "**Manage**" -> "**Certificates & secrets**" and create a "**New client secret**". Note down the secrets "**Value**". @@ -75,7 +75,7 @@ Now, let's get the Lakekeeper REST Catalog running using its official Docker Com This will start the Lakekeeper REST catalog service and its required PostgreSQL backend. Lakekeeper's API will now be available on your local machine at `http://localhost:8181`. ### Step 3: Add the Azure Warehouse to Lakekeeper -Before Olake can use Lakekeeper, we must configure Lakekeeper to be aware of our ADLS Gen2 storage. +Before OLake can use Lakekeeper, we must configure Lakekeeper to be aware of our ADLS Gen2 storage. 1. **Access the Lakekeeper UI:** Open your browser and navigate to `http://localhost:8181`. 2. **Add a New Warehouse:** Find the section for adding a new storage profile or warehouse. @@ -83,7 +83,7 @@ Before Olake can use Lakekeeper, we must configure Lakekeeper to be aware of our * **Warehouse Name:** `olake_warehouse` * **Storage Type:** Select "Azure". * **Credential Type:** Select "Client Credentials". - * **Client ID:** The `Application (client) ID` of the `Olake Warehouse (Development)` App Registration from Step 1. + * **Client ID:** The `Application (client) ID` of the `OLake Warehouse (Development)` App Registration from Step 1. * **Client Secret:** The "Value" of the client secret that we noted down previously in Step 1. * **Tenant ID:** The `Directory (tenant) ID` from the Applications Overview page from Step 1. * **Account Name:** olakehouse @@ -94,7 +94,7 @@ Before Olake can use Lakekeeper, we must configure Lakekeeper to be aware of our Lakekeeper Warehouse Configuration ### Step 4: Set Up the OLake Environment -Now, let's get the Olake UI running using its official Docker Compose setup. +Now, let's get the OLake UI running using its official Docker Compose setup. 1. **Get the OLake Docker Compose file:** Follow the instructions at the [OLake Getting Started](/docs/getting-started/quickstart) to get the `docker-compose.yml` file. 2. **Start OLake:** Follow the steps from the docs and remember to modify the directory in `docker-compose.yml` where OLake's persistent data and configuration will be stored, once done, run the following command: @@ -105,10 +105,10 @@ Now, let's get the Olake UI running using its official Docker Compose setup. This will start the OLake UI along with Temporal and PostgreSQL services. OLakeโ€™s UI will now be available on your local machine at http://localhost:8000. -### Step 5: Configure the Iceberg Destination in the Olake UI -With all services running, we will now connect Olake to Lakekeeper and Azure. +### Step 5: Configure the Iceberg Destination in the OLake UI +With all services running, we will now connect OLake to Lakekeeper and Azure. -1. **Log in to Olake UI:** Open your browser and navigate to http://localhost:8000. +1. 
**Log in to OLake UI:** Open your browser and navigate to http://localhost:8000. 2. **Navigate to Destinations:** Go to the **Destinations** page and click **"Create Destination"**, then select **Apache Iceberg**. 3. **Fill in the Endpoint config:** * **Catalog Type**: `REST Catalog` @@ -118,7 +118,7 @@ With all services running, we will now connect Olake to Lakekeeper and Azure. * **S3 Endpoint**: `https://olakehouse.dfs.core.windows.net` * **AWS Access Key**: Leave empty. * **AWS Secret Key**: Leave empty. -4. **Save and Test** the destination to ensure Olake can communicate with both Lakekeeper and Azure. +4. **Save and Test** the destination to ensure OLake can communicate with both Lakekeeper and Azure. ![azure-adls-6](/img/docs/adls/azure-adls-6.webp) diff --git a/docs/writers/parquet/troubleshoot.mdx b/docs/writers/parquet/troubleshoot.mdx index 607a18ff..23283df4 100644 --- a/docs/writers/parquet/troubleshoot.mdx +++ b/docs/writers/parquet/troubleshoot.mdx @@ -30,7 +30,7 @@ Symptom: failed to write test file to S3: AccessDenied: Access Denied OR failed ``` **Cause:** -- The AWS credentials supplied to Olake do **not** have the minimum set of S3 actions or the bucket ARN is incorrect. +- The AWS credentials supplied to OLake do **not** have the minimum set of S3 actions or the bucket ARN is incorrect. **Resolution:** - Make sure the IAM user / role has the JSON policy shown in the [IAM Permissions](/docs/writers/parquet/permission) page attached. diff --git a/iceberg/2025-02-21-mor-vs-cow.mdx b/iceberg/2025-02-21-mor-vs-cow.mdx index 1a64bb66..784e7e0b 100644 --- a/iceberg/2025-02-21-mor-vs-cow.mdx +++ b/iceberg/2025-02-21-mor-vs-cow.mdx @@ -115,7 +115,7 @@ While positional deletes are efficient to read but slower to write, equality del ### Performance Considerations -The key to maintaining good performance with Merge-on-Read tables lies in regular compaction. Over time, as more changes accumulate in delete files and new data files, query performance can start to degrade. Compaction helps by periodically combining all these changes with the base files, creating a fresh, optimised set of data files. +The key to maintaining good performance with Merge-on-Read tables lies in regular compaction. Over time, as more changes accumulate in delete files and new data files, query performance can start to degrade. Compaction helps by periodically combining all these changes with the base files, creating a fresh, optimized set of data files. ### When to Choose Merge-on-Read diff --git a/iceberg/2025-05-08-olake-iceberg-athena.mdx b/iceberg/2025-05-08-olake-iceberg-athena.mdx index 7af9a114..29459da9 100644 --- a/iceberg/2025-05-08-olake-iceberg-athena.mdx +++ b/iceberg/2025-05-08-olake-iceberg-athena.mdx @@ -20,7 +20,7 @@ Let's dive in. ## Why Iceberg ? -When designing Olake, choosing Apache Iceberg as the destination format was a clear decision. Why? Because Iceberg isn't just another way to put files in S3; it's a leap forward that brings much-needed reliability and advanced capabilities to data lakes. Many in the industry see iceberg as the future standard for data engineering because it tackles fundamental challenges when working with data on object storage head-on: +When designing OLake, choosing Apache Iceberg as the destination format was a clear decision. Why? Because Iceberg isn't just another way to put files in S3; it's a leap forward that brings much-needed reliability and advanced capabilities to data lakes. 
Many in the industry see iceberg as the future standard for data engineering because it tackles fundamental challenges when working with data on object storage head-on: ### 1. Reliable Transactions: @@ -57,7 +57,7 @@ Together, these tools allow you to build an end-to-end pipeline from your databa In this Blog we will be diving into the following sections 1. Configuring AWS Glue catalog & S3 permissions -2. Replicating database data to Apache Iceberg using Olake +2. Replicating database data to Apache Iceberg using OLake 3. Querying iceberg data using Amazon Athena ## Step 1: Configure AWS Glue Catalog and S3 Permissions @@ -114,7 +114,7 @@ By setting up these basic AWS resources and granting OLake the necessary permiss OLake integrates with AWS Glue to automatically register iceberg tables, making them queryable in Athena. -## Step 2: Data Ingestion from Database to Apache Iceberg Using Olake +## Step 2: Data Ingestion from Database to Apache Iceberg Using OLake ### 2.1 Install Docker Ensure Docker is installed and running on your local or cloud environment. OLake is Docker-based, making setup fast and portable @@ -162,7 +162,7 @@ Once OLake has synced data into Iceberg tables and registered them with Glue, yo - On the left sidebar under Data Source, select `AwsDataCatalog` -- Expand the relevant database (as mentioned in the Olake writer cofig file) and tables section to see the Iceberg tables synced by Olake +- Expand the relevant database (as mentioned in the OLake writer cofig file) and tables section to see the Iceberg tables synced by OLake ![olake-iceberg-athena-2](/img/blog/2025/05/olake-iceberg-athena-2.webp) diff --git a/iceberg/2025-05-28-olake-glue-snowflake.mdx b/iceberg/2025-05-28-olake-glue-snowflake.mdx index feb9beeb..6cd40d22 100644 --- a/iceberg/2025-05-28-olake-glue-snowflake.mdx +++ b/iceberg/2025-05-28-olake-glue-snowflake.mdx @@ -121,7 +121,7 @@ By setting up these basic AWS resources and granting OLake the necessary permiss OLake integrates with AWS Glue to automatically register Iceberg tables, making them queryable in Snowflake. -### **Step 2: Data Ingestion from Database to Apache Iceberg Using Olake** +### **Step 2: Data Ingestion from Database to Apache Iceberg Using OLake** **2.1 Install Docker** @@ -140,7 +140,7 @@ Youโ€™ll need two configuration files * [source.json](https://olake.io/docs/connectors/postgres/config) file with your database connection details * [destination.json](https://olake.io/docs/writers/iceberg/catalog/glue) file with your Writer ([Apache Iceberg](https://olake.io/docs/writers/iceberg/catalog/overview)) connection details -You can read here for detailed version[ Olake Configuration](https://olake.io/docs/getting-started/postgres) +You can read here for detailed version[ OLake Configuration](https://olake.io/docs/getting-started/postgres) **2.4 Run the[ Discover](https://olake.io/docs/connectors/postgres/overview) command** to generate a `streams.json` file, which contains the list of available data streams from your source database. You can refer to the official[ OLake documentation](https://olake.io/docs) for complete documentation @@ -163,7 +163,7 @@ This approach is especially useful for organizations that maintain large dataset Similarly, Iceberg tables store both data and metadata externally, typically in cloud object storage. With Snowflakeโ€™s support for external volumes, you can query Iceberg tables directly, leveraging Snowflakeโ€™s performance while benefiting from open table formats and flexible storage. 
You can read more here[ External Tables in Snowflake](https://docs.snowflake.com/en/user-guide/tables-external-intro). -Since we're using Olake to ingest data into Iceberg and managing metadata through the AWS Glue Catalog, Snowflake recognises these as external tables. To query them, we first need to mount the underlying storage as an external volume in Snowflake. +Since we're using OLake to ingest data into Iceberg and managing metadata through the AWS Glue Catalog, Snowflake recognises these as external tables. To query them, we first need to mount the underlying storage as an external volume in Snowflake. **3.1. Configure an external volume** diff --git a/iceberg/2025-06-24-iceberg-partitioning-and-writing-strategies.mdx b/iceberg/2025-06-24-iceberg-partitioning-and-writing-strategies.mdx index 54f5b394..9aabc2c6 100644 --- a/iceberg/2025-06-24-iceberg-partitioning-and-writing-strategies.mdx +++ b/iceberg/2025-06-24-iceberg-partitioning-and-writing-strategies.mdx @@ -9,7 +9,7 @@ tags: [iceberg, partitioning] ![iceberg-partitioning-and-writing-strategies](/img/blog/cover/iceberg-partitioning-and-writing-strategies-cover.webp) -Ever wondered how partitioning in big table formats like Apache Iceberg works out? And what partitioned writing strategies Iceberg can assist you with during ETL? Iceberg handles data partitioning very [differently from any other data lake format (hive paritioning)](/iceberg/hive-partitioning-vs-iceberg-partitioning). +Ever wondered how partitioning in big table formats like Apache Iceberg works out? And what partitioned writing strategies Iceberg can assist you with during ETL? Iceberg handles data partitioning very [differently from any other data lake format (hive partitioning)](/iceberg/hive-partitioning-vs-iceberg-partitioning). In this blog, we will dive into: @@ -56,7 +56,7 @@ AND region IN ('US', 'EU') AND order_amount > 1000 The query engine reads the latest `metadata.json`, and identifies the current `snapshot-id` and retrieves the snapshotโ€™s `manifest-list` location. Then loads the partition specification to understand the partitioning, here `[order_date, region]` from the metadata along with the schema of the table. -- Here `0007-hash.metadata.json` depicts the most recent state of the table, and in case you are familiar with using Apache Iceberg, you should know that `0000-hash.metadata.json` is created when the table is first initialised. +- Here `0007-hash.metadata.json` depicts the most recent state of the table, and in case you are familiar with using Apache Iceberg, you should know that `0000-hash.metadata.json` is created when the table is first initialized. Then it moves on to read the manifest-list, its current snapshot, here `snap-0001-hash.avro`. It provides it with the high-level statistics for each manifest file. The engine examines partition bounds i.e., `lower_bound` and `upper_bound`, across all partitions in that manifest. @@ -81,7 +81,7 @@ Here, Again, you see that without even reading individual `data-file-n.parquet` file, we decide whether to read it or to skip it. This is called **Partition Pruning**, an extremely important partitioning feature of Iceberg as it gifts it with huge efficiency gains. As, here itself, you can see it reads only 1 data file out of all 4 data files, trust me, this comes out to be a huge boon in production level. 
-Along with it, there is a very important concept of **Scan Planning** with filters like **bloom** which transforms the SQL query into a highly optimised, parallel execution plan, but that would go out of scope for this blog, so wonโ€™t be discussing about it. +Along with it, there is a very important concept of **Scan Planning** with filters like **bloom** which transforms the SQL query into a highly optimized, parallel execution plan, but that would go out of scope for this blog, so wonโ€™t be discussing about it. ## Real World Scale: Manifest Explosion @@ -111,7 +111,7 @@ It creates a dedicated Rolling File Writer for each `PartitionKey` and maps it i ![iceberg-partitioning-and-writing-strategies-4](/img/blog/2025/06/iceberg-partitioning-and-writing-strategies-4.webp) -It is a memory-efficient, sequential partition writing strategy that operates under the fundamental constraint of pre-clustered data ordering. This represents a low-memory, high-throughput approach optimised for scenarios where incoming data is already sorted by partition specification and partition values, enabling single-active-writer semantics with minimal memory footprint. +It is a memory-efficient, sequential partition writing strategy that operates under the fundamental constraint of pre-clustered data ordering. This represents a low-memory, high-throughput approach optimized for scenarios where incoming data is already sorted by partition specification and partition values, enabling single-active-writer semantics with minimal memory footprint. With each spec, all records belonging to the same partition value combination must be contiguous. It has O(1) memory usage regardless of partition count with only one file handle and buffer active at any time and supports PartitionSpec evolution through multi-spec capability, but it requires upstream clustering/sorting of data and cannot handle streaming data. @@ -139,7 +139,7 @@ Thus summarising, Partitioning is the heart of Icebergโ€™s performance, scalability, and metadata efficiency. -Whether you're querying massive datasets or writing from high-volume streams, Iceberg's PartitionSpec, Manifest Lists, and Writing Strategies give you the tools to build highly optimised data pipelines. +Whether you're querying massive datasets or writing from high-volume streams, Iceberg's PartitionSpec, Manifest Lists, and Writing Strategies give you the tools to build highly optimized data pipelines. And remember: all of this happens _without you having to worry about folders, MSCK repairs, or schema breakage_. diff --git a/src/data/meetup/6th-meetup.json b/src/data/meetup/6th-meetup.json index 1f7898a1..0cc9b81c 100644 --- a/src/data/meetup/6th-meetup.json +++ b/src/data/meetup/6th-meetup.json @@ -1,5 +1,5 @@ { - "summary": "The sixth OLake community meetup (28 April 2025) centred on a real-world production story from PhysicsWallah and a deeper dive into OLakeโ€™s roadmap. Guest speaker Adish Jain walked the community through PhysicsWallah migration from a Redshift warehouse to an Iceberg-based lakehouse, the pains they faced with Debezium, and how OLake solved them with faster, resumable full loads, direct Iceberg ingestion, and automatic schema evolution. A live demo showed MongoDB-to-Iceberg ingestion running in Kubernetes. 
Shubham Baldava then unpacked OLakeโ€™s Golang + Java architecture, explained plans to shift the Iceberg writer to Go/Rust for lower memory use, previewed an upcoming UI, and announced mid-level SMT transformations arriving within three months.", + "summary": "The sixth OLake community meetup (28 April 2025) centered on a real-world production story from PhysicsWallah and a deeper dive into OLakeโ€™s roadmap. Guest speaker Adish Jain walked the community through PhysicsWallah migration from a Redshift warehouse to an Iceberg-based lakehouse, the pains they faced with Debezium, and how OLake solved them with faster, resumable full loads, direct Iceberg ingestion, and automatic schema evolution. A live demo showed MongoDB-to-Iceberg ingestion running in Kubernetes. Shubham Baldava then unpacked OLakeโ€™s Golang + Java architecture, explained plans to shift the Iceberg writer to Go/Rust for lower memory use, previewed an upcoming UI, and announced mid-level SMT transformations arriving within three months.", "chaptersAndTopics": [ { "title": "Introduction and Agenda", diff --git a/src/pages/webinar/w-8-distributed-stream-processing-in-practice.tsx b/src/pages/webinar/w-8-distributed-stream-processing-in-practice.tsx index 2e06043e..c4723a5c 100644 --- a/src/pages/webinar/w-8-distributed-stream-processing-in-practice.tsx +++ b/src/pages/webinar/w-8-distributed-stream-processing-in-practice.tsx @@ -22,7 +22,7 @@ const hosts = [ { name: "Hasan Geren", role: "Data Engineer @ ProcurePro", - bio: "Hasan's career includes Data Engineering, where he has: โ€ข Designed and optimised ๐˜€๐—ฐ๐—ฎ๐—น๐—ฎ๐—ฏ๐—น๐—ฒ ๐—ฑ๐—ฎ๐˜๐—ฎ๐—ฏ๐—ฎ๐˜€๐—ฒ๐˜€ and cloud storage architectures. โ€ข Built ๐—น๐—ผ๐˜„-๐—น๐—ฎ๐˜๐—ฒ๐—ป๐—ฐ๐˜† ๐—ฑ๐—ฎ๐˜๐—ฎ ๐—ฝ๐—ถ๐—ฝ๐—ฒ๐—น๐—ถ๐—ป๐—ฒ๐˜€ to support real-time applications and analytics dashboards. โ€ข Developed AI/ML-based solutions, including ๐—Ÿ๐—ฆ๐—ง๐—  ๐—บ๐—ผ๐—ฑ๐—ฒ๐—น๐˜€ and ๐—ฟ๐—ฒ๐—ฐ๐—ผ๐—บ๐—บ๐—ฒ๐—ป๐—ฑ๐—ฎ๐˜๐—ถ๐—ผ๐—ป ๐˜€๐˜†๐˜€๐˜๐—ฒ๐—บ๐˜€ to enhance user engagement. โ€ข Collaborated across teams to drive actionable insights, ensuring data solutions align with business goals.", + bio: "Hasan's career includes Data Engineering, where he has: โ€ข Designed and optimized ๐˜€๐—ฐ๐—ฎ๐—น๐—ฎ๐—ฏ๐—น๐—ฒ ๐—ฑ๐—ฎ๐˜๐—ฎ๐—ฏ๐—ฎ๐˜€๐—ฒ๐˜€ and cloud storage architectures. โ€ข Built ๐—น๐—ผ๐˜„-๐—น๐—ฎ๐˜๐—ฒ๐—ป๐—ฐ๐˜† ๐—ฑ๐—ฎ๐˜๐—ฎ ๐—ฝ๐—ถ๐—ฝ๐—ฒ๐—น๐—ถ๐—ป๐—ฒ๐˜€ to support real-time applications and analytics dashboards. โ€ข Developed AI/ML-based solutions, including ๐—Ÿ๐—ฆ๐—ง๐—  ๐—บ๐—ผ๐—ฑ๐—ฒ๐—น๐˜€ and ๐—ฟ๐—ฒ๐—ฐ๐—ผ๐—บ๐—บ๐—ฒ๐—ป๐—ฑ๐—ฎ๐˜๐—ถ๐—ผ๐—ป ๐˜€๐˜†๐˜€๐˜๐—ฒ๐—บ๐˜€ to enhance user engagement. 
โ€ข Collaborated across teams to drive actionable insights, ensuring data solutions align with business goals.", image: "/img/authors/hasan.webp", linkedin: "https://www.linkedin.com/in/hasan-geren/", }, diff --git a/src/theme/DocPaginator/footerNavigations.js b/src/theme/DocPaginator/footerNavigations.js index 206eb284..0e960293 100644 --- a/src/theme/DocPaginator/footerNavigations.js +++ b/src/theme/DocPaginator/footerNavigations.js @@ -418,7 +418,7 @@ export const paginationConfig = { // Issues and PRs '/docs/community/issues-and-prs': { previous: { - title: 'Contribute to Olake', + title: 'Contribute to OLake', permalink: '/docs/community/contributing' }, next: { diff --git a/static/img/community/setting-up-a-dev-env/architecture_diagram.png b/static/img/community/setting-up-a-dev-env/architecture_diagram.png new file mode 100644 index 00000000..e991f720 Binary files /dev/null and b/static/img/community/setting-up-a-dev-env/architecture_diagram.png differ diff --git a/static/img/community/setting-up-a-dev-env/olake_architecture.png b/static/img/community/setting-up-a-dev-env/olake_architecture.png new file mode 100644 index 00000000..347c16a6 Binary files /dev/null and b/static/img/community/setting-up-a-dev-env/olake_architecture.png differ