Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,19 @@ const cli = cac("sitefetch")
cli
.command("[url]", "Fetch a site")
.option("-o, --outfile <path>", "Write the fetched site to a text file")
.option("--format <format>", "Output format (json or text)", {
default: "text",
})
.option("--concurrency <number>", "Number of concurrent requests", {
default: 3,
})
.option("-m, --match <pattern>", "Only fetch matched pages")
.option("--content-selector <selector>", "The CSS selector to find content")
.option("--limit <limit>", "Limit the result to this amount of pages")
.option("--silent", "Do not print any logs")
.option("--save-interval <number>", "Save to file after processing this many pages", {
default: 10,
})
.action(async (url, flags) => {
if (!url) {
cli.outputHelp()
Expand All @@ -34,6 +40,9 @@ cli
match: flags.match && ensureArray(flags.match),
contentSelector: flags.contentSelector,
limit: flags.limit,
outputFile: flags.outfile,
format: flags.format,
saveFrequency: flags["save-interval"],
})

if (pages.size === 0) {
Expand All @@ -55,14 +64,11 @@ cli
)

if (flags.outfile) {
const output = serializePages(
pages,
flags.outfile.endsWith(".json") ? "json" : "text"
)
const output = serializePages(pages, flags.format)
fs.mkdirSync(path.dirname(flags.outfile), { recursive: true })
fs.writeFileSync(flags.outfile, output, "utf8")
} else {
console.log(serializePages(pages, "text"))
console.log(serializePages(pages, flags.format))
}
})

Expand Down
32 changes: 32 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { logger } from "./logger.ts"
import { load } from "cheerio"
import { matchPath } from "./utils.ts"
import type { Options, FetchSiteResult } from "./types.ts"
import { writeFileSync } from "fs"

export async function fetchSite(
url: string,
Expand All @@ -21,6 +22,7 @@ class Fetcher {
#pages: FetchSiteResult = new Map()
#fetched: Set<string> = new Set()
#queue: Queue
#lastSaveCount: number = 0

constructor(public options: Options) {
const concurrency = options.concurrency || 3
Expand All @@ -38,6 +40,27 @@ class Fetcher {
return this.options.contentSelector
}

/**
 * Reports whether an incremental save is due.
 *
 * Returns `false` when no `outputFile` option is set. Otherwise returns
 * `true` once the number of pages collected since the last recorded save
 * reaches `saveFrequency`; a falsy `saveFrequency` (unset or 0) is treated
 * as 10.
 */
#shouldSave() {
  const { outputFile, saveFrequency } = this.options
  if (outputFile) {
    const pagesSinceLastSave = this.#pages.size - this.#lastSaveCount
    return pagesSinceLastSave >= (saveFrequency || 10)
  }
  return false
}

/**
 * Writes every page collected so far to `options.outputFile`.
 *
 * Does nothing when no `outputFile` option is set. The pages are serialized
 * with `serializePages` using `options.format`, falling back to "json" when
 * the format is falsy. On success the current page count is stored in
 * `#lastSaveCount` so `#shouldSave` measures from this point.
 *
 * A failed write is logged as a warning and not rethrown, so a failed
 * incremental save does not propagate to the caller. This method only calls
 * `writeFileSync`; it does not create the parent directory.
 */
#saveToFile() {
  const { outputFile } = this.options
  if (!outputFile) return

  const format = this.options.format || "json"
  const content = serializePages(this.#pages, format)

  try {
    writeFileSync(outputFile, content, "utf8")
    this.#lastSaveCount = this.#pages.size
    logger.info(`Saved ${this.#pages.size} pages to ${outputFile}`)
  } catch (error) {
    // The caught value is `unknown` under strict TypeScript and may not be
    // an Error at runtime, so narrow before reading `.message`.
    const reason = error instanceof Error ? error.message : String(error)
    logger.warn(`Failed to save to file: ${reason}`)
  }
}

async fetchSite(url: string) {
logger.info(
`Started fetching ${c.green(url)} with a concurrency of ${
Expand All @@ -51,6 +74,11 @@ class Fetcher {

await this.#queue.onIdle()

// Final save to ensure we catch any remaining pages
if (this.options.outputFile) {
this.#saveToFile()
}

return this.#pages
}

Expand Down Expand Up @@ -180,6 +208,10 @@ class Fetcher {
url,
content,
})

if (this.#shouldSave()) {
this.#saveToFile()
}
}
}

Expand Down
15 changes: 15 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,21 @@ export type Options = {
*/
limit?: number

/**
* Save results incrementally to a file
*/
outputFile?: string

/**
* How many pages to process before saving to file (default: 10)
*/
saveFrequency?: number

/**
* Output format for saving (default: "json")
*/
format?: "json" | "text"

/**
* A custom function to fetch URL
*/
Expand Down