Basic Usage
A single end-to-end example showing the two core operations — scrape a page,
then crawl a small site — in every supported binding. Each tab is the canonical
"hello world" for that language and follows the same four steps, shown here with
their Python names: create_engine, scrape(engine, url),
CrawlConfig(max_depth=1, max_pages=5), crawl(engine, url). Exact names and
casing vary by binding.
Python
import asyncio
from kreuzcrawl import CrawlConfig, create_engine, crawl, scrape
async def main() -> None:
    """Scrape one page, then run a small bounded crawl, printing a summary of each.

    Both steps issue real network requests to the URLs hard-coded below.
    """
    # Simplest case: scrape a single page with default settings.
    engine = create_engine()
    result = await scrape(engine, "https://example.com/")
    # Fall back to an empty string so a missing title is not printed as "None";
    # the TypeScript, Go, PHP, Dart and Swift examples apply the same fallback.
    print(f"Title: {result.metadata.title or ''}")
    print(f"Status: {result.status_code}")
    print(f"Links found: {len(result.links)}")

    # Crawl from a seed URL, limited to one hop and a handful of pages.
    crawl_engine = create_engine(CrawlConfig(max_depth=1, max_pages=5))
    crawl_result = await crawl(crawl_engine, "https://en.wikipedia.org/wiki/Web_scraping")
    print(f"Pages crawled: {len(crawl_result.pages)}")


if __name__ == "__main__":
    asyncio.run(main())
TypeScript
import { crawl, createEngine, scrape } from "@kreuzberg/kreuzcrawl";
/**
 * Scrapes one page, then runs a small bounded crawl, logging a summary of each.
 */
async function main(): Promise<void> {
  // Simplest case: scrape a single page with default settings.
  const defaultEngine = createEngine();
  const page = await scrape(defaultEngine, "https://example.com/");
  const pageTitle = page.metadata?.title ?? "";
  console.log(`Title: ${pageTitle}`);
  console.log(`Status: ${page.statusCode}`);
  const linkCount = page.links?.length ?? 0;
  console.log(`Links found: ${linkCount}`);

  // Crawl from a seed URL, limited to one hop and a handful of pages.
  const boundedEngine = createEngine({ maxDepth: 1, maxPages: 5 });
  const site = await crawl(boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping");
  const pageCount = site.pages?.length ?? 0;
  console.log(`Pages crawled: ${pageCount}`);
}

// Report any failure and exit with a non-zero status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
Ruby
require "kreuzcrawl"
# Simplest case: scrape a single page with default settings.
default_engine = Kreuzcrawl.create_engine
page = Kreuzcrawl.scrape(default_engine, "https://example.com/")
puts "Title: #{page.metadata.title}"
puts "Status: #{page.status_code}"
puts "Links found: #{page.links.length}"

# Crawl from a seed URL, limited to one hop and a handful of pages.
limits = Kreuzcrawl::CrawlConfig.new(max_depth: 1, max_pages: 5)
bounded_engine = Kreuzcrawl.create_engine(limits)
site = Kreuzcrawl.crawl(bounded_engine, "https://en.wikipedia.org/wiki/Web_scraping")
puts "Pages crawled: #{site.pages.length}"
Go
package main
import (
"fmt"
"log"
kreuzcrawl "github.com/kreuzberg-dev/kreuzcrawl/packages/go"
)
// main scrapes a single page and then runs a small bounded crawl, printing a
// short summary of each. Any error aborts the program through log.Fatalf.
func main() {
	// Simplest case: scrape a single page with default settings.
	defaultEngine, err := kreuzcrawl.CreateEngine(nil)
	if err != nil {
		log.Fatalf("create engine: %v", err)
	}

	page, err := kreuzcrawl.Scrape(defaultEngine, "https://example.com/")
	if err != nil {
		log.Fatalf("scrape: %v", err)
	}

	// Title is a pointer; a nil pointer is printed as an empty string.
	var pageTitle string
	if t := page.Metadata.Title; t != nil {
		pageTitle = *t
	}
	fmt.Printf("Title: %s\n", pageTitle)
	fmt.Printf("Status: %d\n", page.StatusCode)
	fmt.Printf("Links found: %d\n", len(page.Links))

	// Crawl from a seed URL, limited to one hop and a handful of pages.
	limits := kreuzcrawl.NewCrawlConfig(
		kreuzcrawl.WithCrawlConfigMaxDepth(1),
		kreuzcrawl.WithCrawlConfigMaxPages(5),
	)
	boundedEngine, err := kreuzcrawl.CreateEngine(limits)
	if err != nil {
		log.Fatalf("create crawl engine: %v", err)
	}

	site, err := kreuzcrawl.Crawl(boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping")
	if err != nil {
		log.Fatalf("crawl: %v", err)
	}
	fmt.Printf("Pages crawled: %d\n", len(site.Pages))
}
Java
import java.util.Optional;
import dev.kreuzberg.kreuzcrawl.CrawlConfig;
import dev.kreuzberg.kreuzcrawl.CrawlEngineHandle;
import dev.kreuzberg.kreuzcrawl.CrawlResult;
import dev.kreuzberg.kreuzcrawl.Kreuzcrawl;
import dev.kreuzberg.kreuzcrawl.ScrapeResult;
/**
 * Minimal end-to-end example: scrape one page, then crawl a small site,
 * printing a short summary of each result to standard output.
 */
public final class BasicUsage {
    private static final String SCRAPE_URL = "https://example.com/";
    private static final String CRAWL_SEED_URL = "https://en.wikipedia.org/wiki/Web_scraping";

    /** Not instantiable; this class only hosts {@link #main(String[])}. */
    private BasicUsage() { }

    /**
     * Runs the scrape and the bounded crawl in sequence.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if any engine, scrape or crawl call fails
     */
    public static void main(final String[] args) throws Exception {
        // Simplest case: scrape a single page with default settings.
        CrawlEngineHandle defaultEngine = Kreuzcrawl.createEngine();
        ScrapeResult page = Kreuzcrawl.scrape(defaultEngine, SCRAPE_URL);
        // NOTE(review): title() is concatenated as-is; if it is an Optional or
        // null, that wrapper / "null" is what gets printed — confirm its type.
        System.out.println("Title: " + page.metadata().title());
        System.out.println("Status: " + page.statusCode());
        System.out.println("Links found: " + page.links().size());

        // Crawl from a seed URL, limited to one hop and a handful of pages.
        CrawlConfig limits = CrawlConfig.builder()
                .withMaxDepth(Optional.of(1L))
                .withMaxPages(Optional.of(5L))
                .build();
        CrawlEngineHandle boundedEngine = Kreuzcrawl.createEngine(limits);
        CrawlResult site = Kreuzcrawl.crawl(boundedEngine, CRAWL_SEED_URL);
        System.out.println("Pages crawled: " + site.pages().size());
    }
}
C#
using System;
using System.Threading.Tasks;
using Kreuzcrawl;
/// <summary>
/// Minimal end-to-end example: scrape one page, then crawl a small site,
/// writing a short summary of each result to the console.
/// </summary>
internal static class BasicUsage
{
    private const string ScrapeUrl = "https://example.com/";
    private const string CrawlSeedUrl = "https://en.wikipedia.org/wiki/Web_scraping";

    /// <summary>Runs the scrape and the bounded crawl in sequence.</summary>
    public static async Task Main()
    {
        // Simplest case: scrape a single page with default settings.
        var defaultEngine = KreuzcrawlLib.CreateEngine(null);
        var page = await KreuzcrawlLib.Scrape(defaultEngine, ScrapeUrl);
        Console.WriteLine($"Title: {page.Metadata.Title}");
        Console.WriteLine($"Status: {page.StatusCode}");
        Console.WriteLine($"Links found: {page.Links.Count}");

        // Crawl from a seed URL, limited to one hop and a handful of pages.
        var limits = new CrawlConfig { MaxDepth = 1, MaxPages = 5 };
        var boundedEngine = KreuzcrawlLib.CreateEngine(limits);
        var site = await KreuzcrawlLib.Crawl(boundedEngine, CrawlSeedUrl);
        Console.WriteLine($"Pages crawled: {site.Pages.Count}");
    }
}
PHP
<?php
declare(strict_types=1);
use Kreuzcrawl\CrawlConfig;
use Kreuzcrawl\Kreuzcrawl;
// Simplest case: scrape a single page with default settings.
$defaultEngine = Kreuzcrawl::createEngine(null);
$page = Kreuzcrawl::scrape($defaultEngine, "https://example.com/");
// A missing title is printed as an empty string.
$pageTitle = $page->metadata->title ?? "";
$linkCount = count($page->links);
echo "Title: {$pageTitle}\n";
echo "Status: {$page->statusCode}\n";
echo "Links found: {$linkCount}\n";

// Crawl from a seed URL, limited to one hop and a handful of pages.
$limits = CrawlConfig::default();
$limits->maxDepth = 1;
$limits->maxPages = 5;
$boundedEngine = Kreuzcrawl::createEngine($limits);
$site = Kreuzcrawl::crawl($boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping");
$pageCount = count($site->pages);
echo "Pages crawled: {$pageCount}\n";
Elixir
# Simplest case: scrape a single page with default settings.
{:ok, default_engine} = Kreuzcrawl.create_engine()
{:ok, page_json} = Kreuzcrawl.scrape_async(default_engine, "https://example.com/")

# The result arrives as a JSON string; decode it into a map with string keys.
page = Jason.decode!(page_json)
link_count = length(page["links"] || [])

IO.puts("Title: #{get_in(page, ["metadata", "title"])}")
IO.puts("Status: #{page["status_code"]}")
IO.puts("Links found: #{link_count}")

# Crawl from a seed URL, limited to one hop and a handful of pages.
limits_json =
  %Kreuzcrawl.CrawlConfig{max_depth: 1, max_pages: 5}
  |> Jason.encode!()

{:ok, bounded_engine} = Kreuzcrawl.create_engine(limits_json)

{:ok, site_json} =
  Kreuzcrawl.crawl_async(bounded_engine, "https://en.wikipedia.org/wiki/Web_scraping")

site = Jason.decode!(site_json)
page_count = length(site["pages"] || [])
IO.puts("Pages crawled: #{page_count}")
WASM
import { CrawlConfig, crawl, createEngine, scrape } from "@kreuzberg/kreuzcrawl-wasm";
/**
 * Scrapes one page, then runs a small bounded crawl, logging a summary of each.
 */
async function main() {
  // Simplest case: scrape a single page with default settings.
  const defaultEngine = createEngine();
  const page = await scrape(defaultEngine, "https://example.com/");
  const pageTitle = page.metadata?.title ?? "";
  console.log(`Title: ${pageTitle}`);
  console.log(`Status: ${page.statusCode}`);
  const linkCount = page.links?.length ?? 0;
  console.log(`Links found: ${linkCount}`);

  // Crawl from a seed URL, limited to one hop and a handful of pages.
  const limits = new CrawlConfig();
  limits.maxDepth = 1;
  limits.maxPages = 5;
  const boundedEngine = createEngine(limits);
  const site = await crawl(boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping");
  const pageCount = site.pages?.length ?? 0;
  console.log(`Pages crawled: ${pageCount}`);
}

// Failures are logged only; no exit status is set here.
main().catch((failure) => console.error(failure));
Dart
import 'package:kreuzcrawl/kreuzcrawl.dart';
import 'package:kreuzcrawl/src/kreuzcrawl_bridge_generated/frb_generated.dart'
show RustLib;
/// Scrapes one page, then runs a small bounded crawl, printing a summary of each.
Future<void> main() async {
  // Initialise the generated bridge before making any other call.
  await RustLib.init();

  // Simplest case: scrape a single page with default settings.
  final defaultEngine = await KreuzcrawlBridge.createEngine();
  final page = await KreuzcrawlBridge.scrape(defaultEngine, 'https://example.com/');
  final pageTitle = page.metadata.title ?? '';
  print('Title: $pageTitle');
  print('Status: ${page.statusCode}');
  print('Links found: ${page.links.length}');

  // Crawl from a seed URL, limited to one hop and a handful of pages.
  // The limits are supplied as a JSON document.
  final limits = await createCrawlConfigFromJson(
    json: r'{"max_depth":1,"max_pages":5}',
  );
  final boundedEngine = await KreuzcrawlBridge.createEngine(config: limits);
  final site = await KreuzcrawlBridge.crawl(
    boundedEngine,
    'https://en.wikipedia.org/wiki/Web_scraping',
  );
  print('Pages crawled: ${site.pages.length}');
}
Kotlin (Android)
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.PropertyNamingStrategies
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import dev.kreuzberg.kreuzcrawl.android.CrawlConfig
import dev.kreuzberg.kreuzcrawl.android.Kreuzcrawl
import kotlinx.coroutines.runBlocking
// Scrapes one page, then runs a small bounded crawl, printing a summary of each.
// The explicit Unit return type keeps main a valid entry point regardless of
// what the last expression inside runBlocking evaluates to.
fun main(): Unit = runBlocking {
    // Simplest case: scrape a single page with default settings.
    val engine = Kreuzcrawl.createEngine()
    val result = Kreuzcrawl.scrapeAsync(engine, "https://example.com/")
    // Fall back to an empty string so a missing title is not printed as "null";
    // the TypeScript, Go, PHP, Dart and Swift examples apply the same fallback.
    println("Title: ${result.metadata.title ?: ""}")
    println("Status: ${result.statusCode}")
    println("Links found: ${result.links.size}")

    // Crawl from a seed URL, limited to one hop and a handful of pages.
    // The config is built by deserialising snake_case JSON with Jackson.
    val mapper = ObjectMapper()
        .registerKotlinModule()
        .setPropertyNamingStrategy(PropertyNamingStrategies.SNAKE_CASE)
    val config = mapper.readValue(
        "{\"max_depth\":1,\"max_pages\":5}",
        CrawlConfig::class.java,
    )
    val crawlEngine = Kreuzcrawl.createEngine(config)
    val crawlResult = Kreuzcrawl.crawlAsync(
        crawlEngine,
        "https://en.wikipedia.org/wiki/Web_scraping",
    )
    println("Pages crawled: ${crawlResult.pages.size}")
}
Swift
import Foundation
import Kreuzcrawl
/// Minimal end-to-end example: scrape one page, then crawl a small site,
/// printing a short summary of each result.
@main
struct BasicUsage {
    /// Runs the scrape and the bounded crawl in sequence; any thrown error propagates.
    static func main() async throws {
        // Simplest case: scrape a single page with default settings.
        let defaultEngine = try createEngine(nil)
        let page = try await scrape(defaultEngine, "https://example.com/")
        // A missing title is printed as an empty string.
        let pageTitle = page.metadata().title()?.toString() ?? ""
        print("Title: \(pageTitle)")
        print("Status: \(page.status_code())")
        print("Links found: \(page.links().count)")

        // Crawl from a seed URL, limited to one hop and a handful of pages.
        // The limits are supplied as a JSON document.
        let limits = try crawlConfigFromJson("{\"max_depth\":1,\"max_pages\":5}")
        let boundedEngine = try createEngine(limits)
        let site = try await crawl(boundedEngine, "https://en.wikipedia.org/wiki/Web_scraping")
        print("Pages crawled: \(site.pages().count)")
    }
}
Zig
const std = @import("std");
const kreuzcrawl = @import("kreuzcrawl");
/// Returns the value stored under `key` when `value` is a JSON object that
/// contains it; returns null for a missing key or a non-object value.
fn jsonField(value: std.json.Value, key: []const u8) ?std.json.Value {
    return switch (value) {
        .object => |obj| obj.get(key),
        else => null,
    };
}

/// Returns the length of the JSON array stored under `key`, or 0 when the key
/// is missing or does not hold an array.
fn jsonArrayLen(value: std.json.Value, key: []const u8) usize {
    const found = jsonField(value, key) orelse return 0;
    return switch (found) {
        .array => |arr| arr.items.len,
        else => 0,
    };
}

/// Scrapes one page, then runs a small bounded crawl, printing a summary of each.
/// Results arrive as JSON text and are parsed with std.json.
pub fn main() !void {
    var gpa: std.heap.DebugAllocator(.{}) = .init;
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Simplest case: scrape a single page with default settings.
    const scrape_json = try kreuzcrawl.scrape(null, "https://example.com/");
    // The returned buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(scrape_json);
    var scrape_parsed = try std.json.parseFromSlice(std.json.Value, allocator, scrape_json, .{});
    defer scrape_parsed.deinit();
    const result = scrape_parsed.value;

    // Tolerate a missing or non-string title (and missing metadata) instead of
    // panicking on an optional unwrap; the Elixir and TypeScript examples guard
    // the same fields.
    const title: []const u8 = blk: {
        const metadata = jsonField(result, "metadata") orelse break :blk "";
        const value = jsonField(metadata, "title") orelse break :blk "";
        break :blk switch (value) {
            .string => |s| s,
            else => "",
        };
    };
    std.debug.print("Title: {s}\n", .{title});
    // status_code is read strictly, as in the other bindings' examples.
    std.debug.print("Status: {d}\n", .{result.object.get("status_code").?.integer});
    std.debug.print("Links found: {d}\n", .{jsonArrayLen(result, "links")});

    // Crawl from a seed URL, limited to one hop and a handful of pages.
    const crawl_json = try kreuzcrawl.crawl(
        "{\"max_depth\":1,\"max_pages\":5}",
        "https://en.wikipedia.org/wiki/Web_scraping",
    );
    defer std.heap.c_allocator.free(crawl_json);
    var crawl_parsed = try std.json.parseFromSlice(std.json.Value, allocator, crawl_json, .{});
    defer crawl_parsed.deinit();
    std.debug.print("Pages crawled: {d}\n", .{jsonArrayLen(crawl_parsed.value, "pages")});
}
For deeper walkthroughs of scrape, crawl, and map operations — including configuration options, link filtering, and result-shape details — see the Quick Start and the per-operation guides under Guides → Core.