Basic Usage

Basic Usage

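A single end-to-end example showing the two core operations — scrape a page, then crawl a small site — in every supported binding. Each tab is the canonical "hello world" for that language and follows the same four steps, shown here with their Python names: create_engine(), scrape(engine, url), CrawlConfig(max_depth=1, max_pages=5), and crawl(engine, url). Other bindings use their own naming conventions (for example, createEngine in TypeScript), and a few pass the configuration as JSON instead of a typed object.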

Python · TypeScript · Ruby · Go · Java · C# · PHP · Elixir · WASM · Dart · Kotlin (Android) · Swift · Zig

Python

import asyncio

from kreuzcrawl import CrawlConfig, create_engine, crawl, scrape


async def main() -> None:
    """Scrape one page, then crawl a small site, printing a summary of each.

    The scrape uses an engine created with default settings. The crawl uses a
    second engine configured with ``max_depth=1`` and ``max_pages=5``.
    """
    # Simplest case: scrape a single page with default settings.
    engine = create_engine()
    result = await scrape(engine, "https://example.com/")
    # Fall back to an empty string so a page without a title does not print
    # the literal text "None" (this matches the other language examples).
    print(f"Title: {result.metadata.title or ''}")
    print(f"Status: {result.status_code}")
    print(f"Links found: {len(result.links)}")

    # Crawl from a seed URL, limited to one hop and a handful of pages.
    crawl_engine = create_engine(CrawlConfig(max_depth=1, max_pages=5))
    crawl_result = await crawl(crawl_engine, "https://en.wikipedia.org/wiki/Web_scraping")
    print(f"Pages crawled: {len(crawl_result.pages)}")


if __name__ == "__main__":
    # Run the coroutine to completion when executed as a script.
    asyncio.run(main())

TypeScript

import { crawl, createEngine, scrape } from "@kreuzberg/kreuzcrawl";

/** Scrapes one page, then crawls a small site, logging a summary of each. */
async function main(): Promise<void> {
  // Single-page scrape using an engine with default settings.
  const defaultEngine = createEngine();
  const page = await scrape(defaultEngine, "https://example.com/");
  const pageTitle = page.metadata?.title ?? "";
  console.log(`Title: ${pageTitle}`);
  console.log(`Status: ${page.statusCode}`);
  const linkCount = page.links?.length ?? 0;
  console.log(`Links found: ${linkCount}`);

  // Bounded crawl: at most one hop from the seed URL and five pages.
  const boundedEngine = createEngine({ maxDepth: 1, maxPages: 5 });
  const site = await crawl(boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping");
  const pageCount = site.pages?.length ?? 0;
  console.log(`Pages crawled: ${pageCount}`);
}

// Report any failure and exit with a non-zero status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});

Ruby

require "kreuzcrawl"

# Scrape one page with an engine built from default settings.
default_engine = Kreuzcrawl.create_engine
page = Kreuzcrawl.scrape(default_engine, "https://example.com/")
puts "Title: #{page.metadata.title}"
puts "Status: #{page.status_code}"
puts "Links found: #{page.links.length}"

# Bounded crawl: at most one hop from the seed URL and five pages.
bounded_engine = Kreuzcrawl.create_engine(
  Kreuzcrawl::CrawlConfig.new(max_depth: 1, max_pages: 5)
)
site = Kreuzcrawl.crawl(bounded_engine, "https://en.wikipedia.org/wiki/Web_scraping")
puts "Pages crawled: #{site.pages.length}"

Go

package main

import (
    "fmt"
    "log"

    kreuzcrawl "github.com/kreuzberg-dev/kreuzcrawl/packages/go"
)

// main scrapes one page and then crawls a small site, printing a summary of
// each. Any error aborts the program through log.Fatalf.
func main() {
    // Single-page scrape; nil is passed as the engine configuration.
    scraper, scraperErr := kreuzcrawl.CreateEngine(nil)
    if scraperErr != nil {
        log.Fatalf("create engine: %v", scraperErr)
    }

    page, scrapeErr := kreuzcrawl.Scrape(scraper, "https://example.com/")
    if scrapeErr != nil {
        log.Fatalf("scrape: %v", scrapeErr)
    }

    // Title is a pointer; print an empty string when it is nil.
    var pageTitle string
    if titlePtr := page.Metadata.Title; titlePtr != nil {
        pageTitle = *titlePtr
    }
    fmt.Printf("Title: %s\n", pageTitle)
    fmt.Printf("Status: %d\n", page.StatusCode)
    fmt.Printf("Links found: %d\n", len(page.Links))

    // Bounded crawl: at most one hop from the seed URL and five pages.
    crawler, crawlerErr := kreuzcrawl.CreateEngine(kreuzcrawl.NewCrawlConfig(
        kreuzcrawl.WithCrawlConfigMaxDepth(1),
        kreuzcrawl.WithCrawlConfigMaxPages(5),
    ))
    if crawlerErr != nil {
        log.Fatalf("create crawl engine: %v", crawlerErr)
    }

    site, crawlErr := kreuzcrawl.Crawl(crawler, "https://en.wikipedia.org/wiki/Web_scraping")
    if crawlErr != nil {
        log.Fatalf("crawl: %v", crawlErr)
    }
    fmt.Printf("Pages crawled: %d\n", len(site.Pages))
}

Java

import java.util.Optional;

import dev.kreuzberg.kreuzcrawl.CrawlConfig;
import dev.kreuzberg.kreuzcrawl.CrawlEngineHandle;
import dev.kreuzberg.kreuzcrawl.CrawlResult;
import dev.kreuzberg.kreuzcrawl.Kreuzcrawl;
import dev.kreuzberg.kreuzcrawl.ScrapeResult;

/**
 * Minimal example: scrape one page, then crawl a small site, printing a
 * summary of each to standard output.
 */
public final class BasicUsage {
    private BasicUsage() { }

    /**
     * Runs the scrape and the bounded crawl in sequence.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if any library call fails
     */
    public static void main(final String[] args) throws Exception {
        // Single-page scrape using an engine with default settings.
        final CrawlEngineHandle defaultEngine = Kreuzcrawl.createEngine();
        final ScrapeResult page = Kreuzcrawl.scrape(defaultEngine, "https://example.com/");
        System.out.println("Title: " + page.metadata().title());
        System.out.println("Status: " + page.statusCode());
        System.out.println("Links found: " + page.links().size());

        // Bounded crawl: at most one hop from the seed URL and five pages.
        final CrawlEngineHandle boundedEngine = Kreuzcrawl.createEngine(
            CrawlConfig.builder()
                .withMaxDepth(Optional.of(1L))
                .withMaxPages(Optional.of(5L))
                .build()
        );
        final String seedUrl = "https://en.wikipedia.org/wiki/Web_scraping";
        final CrawlResult site = Kreuzcrawl.crawl(boundedEngine, seedUrl);
        System.out.println("Pages crawled: " + site.pages().size());
    }
}

C#

using System;
using System.Threading.Tasks;

using Kreuzcrawl;

/// <summary>
/// Minimal example: scrape one page, then crawl a small site, writing a
/// summary of each to the console.
/// </summary>
internal static class BasicUsage
{
    /// <summary>Runs the scrape and the bounded crawl in sequence.</summary>
    public static async Task Main()
    {
        // Single-page scrape; null is passed as the engine configuration.
        var defaultEngine = KreuzcrawlLib.CreateEngine(null);
        var page = await KreuzcrawlLib.Scrape(defaultEngine, "https://example.com/");
        Console.WriteLine($"Title: {page.Metadata.Title}");
        Console.WriteLine($"Status: {page.StatusCode}");
        Console.WriteLine($"Links found: {page.Links.Count}");

        // Bounded crawl: at most one hop from the seed URL and five pages.
        var boundedEngine = KreuzcrawlLib.CreateEngine(
            new CrawlConfig { MaxDepth = 1, MaxPages = 5 }
        );
        var site = await KreuzcrawlLib.Crawl(
            boundedEngine,
            "https://en.wikipedia.org/wiki/Web_scraping"
        );
        Console.WriteLine($"Pages crawled: {site.Pages.Count}");
    }
}

PHP

<?php
declare(strict_types=1);

use Kreuzcrawl\CrawlConfig;
use Kreuzcrawl\Kreuzcrawl;

// Single-page scrape; null is passed as the engine configuration.
$defaultEngine = Kreuzcrawl::createEngine(null);
$page = Kreuzcrawl::scrape($defaultEngine, "https://example.com/");
// A missing title is printed as an empty string.
$pageTitle = $page->metadata->title ?? "";
echo "Title: ", $pageTitle, "\n";
echo "Status: ", $page->statusCode, "\n";
echo "Links found: ", count($page->links), "\n";

// Bounded crawl: at most one hop from the seed URL and five pages.
$limits = CrawlConfig::default();
$limits->maxDepth = 1;
$limits->maxPages = 5;
$boundedEngine = Kreuzcrawl::createEngine($limits);
$site = Kreuzcrawl::crawl($boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping");
echo "Pages crawled: ", count($site->pages), "\n";

Elixir

# Single-page scrape using an engine with default settings. The result comes
# back as a JSON string and is decoded into a map before use.
{:ok, default_engine} = Kreuzcrawl.create_engine()
{:ok, page_json} = Kreuzcrawl.scrape_async(default_engine, "https://example.com/")
page = page_json |> Jason.decode!()
IO.puts("Title: #{get_in(page, ["metadata", "title"])}")
IO.puts("Status: #{page["status_code"]}")
page_links = page["links"] || []
IO.puts("Links found: #{length(page_links)}")

# Bounded crawl: at most one hop from the seed URL and five pages. The config
# struct is JSON-encoded before it is handed to the engine.
limits_json =
  %Kreuzcrawl.CrawlConfig{max_depth: 1, max_pages: 5}
  |> Jason.encode!()

{:ok, bounded_engine} = Kreuzcrawl.create_engine(limits_json)

{:ok, site_json} =
  Kreuzcrawl.crawl_async(bounded_engine, "https://en.wikipedia.org/wiki/Web_scraping")

site = site_json |> Jason.decode!()
site_pages = site["pages"] || []
IO.puts("Pages crawled: #{length(site_pages)}")

WASM

import { CrawlConfig, crawl, createEngine, scrape } from "@kreuzberg/kreuzcrawl-wasm";

/** Scrapes one page, then crawls a small site, logging a summary of each. */
async function main() {
  // Single-page scrape using an engine with default settings.
  const defaultEngine = createEngine();
  const page = await scrape(defaultEngine, "https://example.com/");
  const pageTitle = page.metadata?.title ?? "";
  console.log(`Title: ${pageTitle}`);
  console.log(`Status: ${page.statusCode}`);
  const linkCount = page.links?.length ?? 0;
  console.log(`Links found: ${linkCount}`);

  // Bounded crawl: at most one hop from the seed URL and five pages.
  const limits = new CrawlConfig();
  limits.maxDepth = 1;
  limits.maxPages = 5;
  const boundedEngine = createEngine(limits);
  const site = await crawl(boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping");
  const pageCount = site.pages?.length ?? 0;
  console.log(`Pages crawled: ${pageCount}`);
}

// Log any failure from the example run.
main().catch((failure) => console.error(failure));

Dart

import 'package:kreuzcrawl/kreuzcrawl.dart';
import 'package:kreuzcrawl/src/kreuzcrawl_bridge_generated/frb_generated.dart'
    show RustLib;

/// Scrapes one page, then crawls a small site, printing a summary of each.
Future<void> main() async {
  // RustLib.init() is awaited before any KreuzcrawlBridge call is made.
  await RustLib.init();

  // Single-page scrape using an engine with default settings.
  final defaultEngine = await KreuzcrawlBridge.createEngine();
  final page = await KreuzcrawlBridge.scrape(defaultEngine, 'https://example.com/');
  final pageTitle = page.metadata.title ?? '';
  print('Title: $pageTitle');
  print('Status: ${page.statusCode}');
  print('Links found: ${page.links.length}');

  // Bounded crawl: at most one hop from the seed URL and five pages. The
  // configuration is built from a JSON document.
  final limits = await createCrawlConfigFromJson(
    json: r'{"max_depth":1,"max_pages":5}',
  );
  final boundedEngine = await KreuzcrawlBridge.createEngine(config: limits);
  final site = await KreuzcrawlBridge.crawl(
    boundedEngine,
    'https://en.wikipedia.org/wiki/Web_scraping',
  );
  print('Pages crawled: ${site.pages.length}');
}

Kotlin (Android)

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.PropertyNamingStrategies
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import dev.kreuzberg.kreuzcrawl.android.CrawlConfig
import dev.kreuzberg.kreuzcrawl.android.Kreuzcrawl
import kotlinx.coroutines.runBlocking

/**
 * Scrapes one page, then crawls a small site, printing a summary of each.
 *
 * The crawl configuration is deserialized from snake_case JSON with Jackson.
 */
fun main() = runBlocking {
    // Simplest case: scrape a single page with default settings.
    val engine = Kreuzcrawl.createEngine()
    val result = Kreuzcrawl.scrapeAsync(engine, "https://example.com/")
    // Fall back to an empty string so a missing title does not print the
    // literal text "null" (this matches the other language examples).
    println("Title: ${result.metadata.title ?: ""}")
    println("Status: ${result.statusCode}")
    println("Links found: ${result.links.size}")

    // Crawl from a seed URL, limited to one hop and a handful of pages.
    // The mapper translates snake_case JSON keys onto the config's properties.
    val mapper = ObjectMapper()
        .registerKotlinModule()
        .setPropertyNamingStrategy(PropertyNamingStrategies.SNAKE_CASE)
    val config = mapper.readValue(
        "{\"max_depth\":1,\"max_pages\":5}",
        CrawlConfig::class.java,
    )
    val crawlEngine = Kreuzcrawl.createEngine(config)
    val crawlResult = Kreuzcrawl.crawlAsync(
        crawlEngine,
        "https://en.wikipedia.org/wiki/Web_scraping",
    )
    println("Pages crawled: ${crawlResult.pages.size}")
}

Swift

import Foundation
import Kreuzcrawl

/// Minimal example: scrape one page, then crawl a small site, printing a
/// summary of each.
@main
struct BasicUsage {
    /// Runs the scrape and the bounded crawl in sequence; any thrown error
    /// propagates out of `main`.
    static func main() async throws {
        // Single-page scrape; nil is passed as the engine configuration.
        let defaultEngine = try createEngine(nil)
        let page = try await scrape(defaultEngine, "https://example.com/")
        // A missing title is printed as an empty string.
        let pageTitle = page.metadata().title()?.toString() ?? ""
        print("Title: \(pageTitle)")
        print("Status: \(page.status_code())")
        print("Links found: \(page.links().count)")

        // Bounded crawl: at most one hop from the seed URL and five pages.
        // The configuration is built from a JSON document.
        let limits = try crawlConfigFromJson("{\"max_depth\":1,\"max_pages\":5}")
        let boundedEngine = try createEngine(limits)
        let site = try await crawl(boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping")
        print("Pages crawled: \(site.pages().count)")
    }
}

Zig

const std = @import("std");
const kreuzcrawl = @import("kreuzcrawl");

/// Scrapes one page and then crawls a small site, printing a summary of each.
///
/// Both library calls return JSON text, which is parsed into a
/// `std.json.Value` tree before the summary fields are read.
pub fn main() !void {
    var gpa: std.heap.DebugAllocator(.{}) = .init;
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Simplest case: scrape a single page with default settings.
    const scrape_json = try kreuzcrawl.scrape(null, "https://example.com/");
    // The returned buffer is freed with the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(scrape_json);
    var scrape_parsed = try std.json.parseFromSlice(std.json.Value, allocator, scrape_json, .{});
    defer scrape_parsed.deinit();
    const result = &scrape_parsed.value;

    // The title is optional: fall back to "" when "metadata" or "title" is
    // absent, or when the title is not a JSON string, instead of panicking
    // on an unwrapped null.
    const title: []const u8 = blk: {
        const metadata = result.object.get("metadata") orelse break :blk "";
        if (metadata != .object) break :blk "";
        const value = metadata.object.get("title") orelse break :blk "";
        break :blk if (value == .string) value.string else "";
    };
    std.debug.print("Title: {s}\n", .{title});
    std.debug.print("Status: {d}\n", .{result.object.get("status_code").?.integer});
    std.debug.print("Links found: {d}\n", .{result.object.get("links").?.array.items.len});

    // Crawl from a seed URL, limited to one hop and a handful of pages.
    // The configuration is passed as a JSON string.
    const crawl_json = try kreuzcrawl.crawl(
        "{\"max_depth\":1,\"max_pages\":5}",
        "https://en.wikipedia.org/wiki/Web_scraping",
    );
    defer std.heap.c_allocator.free(crawl_json);
    var crawl_parsed = try std.json.parseFromSlice(std.json.Value, allocator, crawl_json, .{});
    defer crawl_parsed.deinit();
    const crawl_result = &crawl_parsed.value;
    std.debug.print("Pages crawled: {d}\n", .{crawl_result.object.get("pages").?.array.items.len});
}

For deeper walkthroughs of scrape, crawl, and map operations — including configuration options, link filtering, and result-shape details — see the Quick Start and the per-operation guides under Guides → Core.

Edit this page on GitHub