refactor importKudos to match new typesense schema
- partition generation jobs by doc_num
bdb-dd committed Sep 27, 2024
1 parent ba813c1 commit 3df7a8a
Showing 3 changed files with 43 additions and 15 deletions.
cli/src/functions/generateSearchPhrases.ts (15 changes: 14 additions & 1 deletion)
```diff
@@ -40,6 +40,7 @@ type SearchPhraseList = z.infer<typeof SearchPhraseListSchema>;
 
 type SearchHit = {
   id: string;
+  doc_num: string;
   url: string;
   contentMarkdown: string;
 };
```
```diff
@@ -70,7 +71,9 @@ async function main() {
     .requiredOption('-s, --source <string>', 'collection to extract from')
     .requiredOption('-t, --target <string>', 'target collection name')
     .option('--prompt <string>, ', 'prompt name', 'original')
-    .option('-n', 'create new target collection');
+    .option('-n', 'create new target collection')
+    .option('--partitions <number>', 'number of partitions to divide the work into')
+    .option('--partition <number>', 'partition number for this process [0 < partition < partitions]');
 
   program.parse(process.argv);
   const opts = program.opts();
```
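The two new options split phrase generation across cooperating processes, e.g. something like `node generateSearchPhrases.js -s kudos-docs -t search-phrases --partitions 4 --partition 2` (the entry point and collection names here are assumptions, not from the diff). Commander delivers these option values as raw strings unless a parser function is supplied, so a cautious caller might normalize and validate them after `program.parse()`:

```typescript
// Hedged sketch: coerce and sanity-check the new partition options.
// opts comes from program.opts(); the validation bounds are an assumption.
const partitions = opts.partitions === undefined ? 1 : parseInt(opts.partitions, 10);
const partition = opts.partition === undefined ? 0 : parseInt(opts.partition, 10);
// The !(x >= y) form also rejects NaN from an unparsable value.
if (!(partitions >= 1) || !(partition >= 0 && partition < partitions)) {
  throw new Error(`invalid partitioning: --partition ${partition} of --partitions ${partitions}`);
}
```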
```diff
@@ -141,6 +144,7 @@ async function main() {
       result.grouped_hits.flatMap((hit: any) =>
         hit.hits.map((document: any) => ({
           id: document.document.id,
+          doc_num: document.document.doc_num,
           url: document.document.url_without_anchor,
           contentMarkdown: document.document.content_markdown || '',
         })),
```
```diff
@@ -160,6 +164,13 @@ async function main() {
     const searchHit = searchHits[docIndex];
     const url = searchHit.url;
 
+    const partitionId = parseInt(searchHit.doc_num) % opts.partitions;
+    if (partitionId != opts.partition) {
+      console.log(`Skipping document ${searchHit.doc_num} (partition ${partitionId})`);
+      docIndex++;
+      continue;
+    }
+
     // console.log(`searchHit: ${JSON.stringify(searchHit)}`);
 
     const existingPhrases = await lookupSearchPhrases(url, targetCollectionName, promptName);
```
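The skip is a plain modulo assignment: every `doc_num` maps to exactly one partition, so N processes launched with the same `--partitions` value and distinct `--partition` values cover disjoint, exhaustive slices of the corpus. A minimal sketch of the invariant, assuming `doc_num` is a numeric string as in the Kudos schema:

```typescript
// Each document belongs to exactly one partition.
function assignedPartition(docNum: string, partitions: number): number {
  return parseInt(docNum, 10) % partitions;
}
// e.g. with 4 partitions, doc_num "1042" -> 1042 % 4 = 2, so only the
// process started with --partition 2 generates phrases for that document.
```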
```diff
@@ -230,7 +241,9 @@ async function main() {
           console.log(phrase);
           const entry: SearchPhraseEntry = {
             id: '' + searchHit.id + '-' + index,
+            doc_num: '' + searchHit.id,
             chunk_id: '' + searchHit.id || '',
+            chunk_index: index,
             url: url,
             search_phrase: phrase,
             sort_order: index,
```
cli/src/functions/importKudos.ts (40 changes: 27 additions & 13 deletions)
```diff
@@ -87,6 +87,7 @@ const kudosDocTypesenseSchema: CollectionCreateSchema = {
 
 const KudosDocSchema = z.object({
   id: z.string().optional(),
+  doc_num: z.string(),
   url_without_anchor: z.string(),
   updated_at: z.number(),
   uuid: z.string(),
```
```diff
@@ -112,6 +113,7 @@ type KudosDoc = z.infer<typeof KudosDocSchema>;
 const KudosChunkSchema = z.object({
   id: z.string().optional(),
   doc_num: z.string(),
+  chunk_id: z.string(),
   chunk_index: z.number(),
   content_markdown: z.string(),
   url: z.string(),
```
```diff
@@ -126,7 +128,7 @@ const KudosChunkSchema = z.object({
 type KudosChunk = z.infer<typeof KudosChunkSchema>;
 
 const kudosChunkTypesenseSchema = (
-  collectionName: string,
+  docsCollectionName: string,
 ): CollectionCreateSchema => {
   return {
     name: 'kudos_chunks',
```
```diff
@@ -150,7 +152,7 @@ const kudosChunkTypesenseSchema = (
       "locale": "",
       "name": "doc_num",
       "optional": false,
-      "reference": collectionName + ".doc_num",
+      "reference": docsCollectionName + ".doc_num",
       "sort": false,
       "stem": false,
       "store": true,
```
```diff
@@ -314,7 +316,7 @@ async function main() {
   const docCollectionFound = await typesenseSearch.lookupCollectionByNameExact(docCollectionName);
 
   if (!chunkCollectionName) {
-    chunkCollectionName = docCollectionName + '_chunks';
+    chunkCollectionName = docCollectionName.replace('docs', 'chunks');
   }
   const chunkCollectionFound =
     await typesenseSearch.lookupCollectionByNameExact(chunkCollectionName);
```
```diff
@@ -349,8 +351,8 @@ async function main() {
   } catch (error) {
     if (error instanceof Errors.ObjectNotFound) {
       console.log('Creating new collection:', chunkCollectionName);
-      kudosChunkTypesenseSchema.name = chunkCollectionName;
-      await typesenseClient.collections().create(kudosChunkTypesenseSchema);
+      let chunkSchema = kudosChunkTypesenseSchema(docCollectionName);
+      await typesenseClient.collections().create(chunkSchema);
       console.log(`Kudos chunk collection ${chunkCollectionName} created successfully.`);
     } else {
       throw error;
```
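The chunk schema now comes from a factory parameterized on the docs collection name, rather than a module-level object mutated before creation, so the `reference` field can point at whichever docs collection the import targets. A minimal sketch of the resulting call pattern (the collection name is a placeholder, and this assumes an async context):

```typescript
// Minimal sketch, assuming an existing docs collection named 'kudos_docs'.
const chunkSchema = kudosChunkTypesenseSchema('kudos_docs');
// chunkSchema.fields now includes a field shaped like:
//   { name: 'doc_num', type: 'string', reference: 'kudos_docs.doc_num', ... }
// so each chunk's doc_num is expected to resolve to a parent document.
await typesenseClient.collections().create(chunkSchema);
```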
```diff
@@ -410,13 +412,18 @@ async function main() {
 
   while (true) {
     console.log(`Loading ${opts.pagesize} documents from page ${page}`);
-    const [rows] = await connection.execute(
-      "SELECT * FROM documents WHERE (type = 'Evaluering')" +
-        " AND (published_at > '2021-01-01 00:00:00.000') AND (published_at < '2023-01-01 00:00:00.000')" +
-        ' LIMIT ? OFFSET ?',
+    const [rowResults] = await connection.execute(
+      "SELECT * FROM documents " +
+        "WHERE ((type = 'Evaluering') OR (type = 'Årsrapport')) " +
+        " AND (published_at > '2020-01-01 00:00:00.000') " +
+        " AND (published_at < '2024-01-01 00:00:00.000') " +
+        //" ORDER BY published_at asc " +
+        " LIMIT ? OFFSET ?",
       [opts.pagesize, (page - 1) * opts.pagesize],
     );
-    if (rows.length == 0) break;
+    const rows = rowResults as mysql.RowDataPacket[];
+    // console.log(`query results metadata: ${JSON.stringify(metadata)}`);
+    if (!Array.isArray(rows) || rows.length === 0) break;
     if (opts.pages >= 0 && page - opts.firstpage >= opts.pages) break;
 
     for (let i = 0; i < rows.length; i++) {
```
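As an aside, the `as mysql.RowDataPacket[]` assertion could also be expressed through the row-type parameter that mysql2/promise's `execute()` accepts; a hedged sketch with the query reduced to its pagination skeleton:

```typescript
import mysql from 'mysql2/promise';

// Inside the paging loop; connection, opts, and page are as in the file.
const [rows] = await connection.execute<mysql.RowDataPacket[]>(
  'SELECT * FROM documents LIMIT ? OFFSET ?',
  [opts.pagesize, (page - 1) * opts.pagesize],
);
// rows is typed as mysql.RowDataPacket[] without a cast.
```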
```diff
@@ -446,6 +453,7 @@ async function main() {
 
       const doc: KudosDoc = {
         id: '' + row.id,
+        doc_num: '' + row.id,
         uuid: row.uuid,
         url_without_anchor: row.source_document_url ? row.source_document_url : "https://unknown",
         updated_at: Math.floor(new Date().getTime() / 1000),
```
```diff
@@ -480,7 +488,7 @@ async function main() {
           .import([doc], { action: 'upsert' });
       }
 
-      let batch = [];
+      let batch: KudosChunk[] = [];
 
       let chunkStart = 0;
       for (let chunkIndex = 0; chunkIndex < chunkLengths.length; chunkIndex++) {
```
```diff
@@ -490,13 +498,14 @@ async function main() {
         console.log(
           `--- Doc id: ${row.id}, chunk: ${chunkIndex + 1} of ${chunkLengths.length} -- Tokens: ${tokens.length} -- Length: ${chunkLengths[chunkIndex] - chunkStart} ------------------------`,
         );
-        console.log(chunkText);
+        // console.log(chunkText);
         chunkStart = chunkLengths[chunkIndex] + 1;
 
         const markdown_checksum = sha1(chunkText);
 
         const outChunk: KudosChunk = {
           id: '' + row.id + '-' + chunkIndex,
+          chunk_id: '' + row.id + '-' + chunkIndex,
           doc_num: '' + row.id,
           chunk_index: chunkIndex,
           content_markdown: chunkText,
```
```diff
@@ -509,10 +518,15 @@ async function main() {
           markdown_checksum: markdown_checksum,
           token_count: tokens.length,
         };
+
         batch.push(outChunk);
 
         if (!opts.dryrun) {
-          if (batch.length === chunkImportBatchSize || chunkIndex === chunkLengths.length - 1) {
+          if (batch.length == 0) {
+            console.log(`No chunks found for doc_num: ${row.id}`);
+          } else if (
+            batch.length === chunkImportBatchSize
+            || chunkIndex === chunkLengths.length - 1) {
             console.log(`Uploading ${batch.length} chunks for doc id ${row.id}`);
             await typesenseClient
               .collections(chunkCollectionName)
```
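The flush condition uploads either a full batch or whatever remains once a document's last chunk has been appended, so a trailing partial batch is never dropped. The same pattern as a small helper (a sketch; `chunkImportBatchSize`, `typesenseClient`, and `chunkCollectionName` are taken from the surrounding file):

```typescript
// Upload when the batch is full or the document's last chunk was appended;
// returns the batch to keep filling (empty after a flush).
async function flushBatch(batch: KudosChunk[], isLastChunk: boolean): Promise<KudosChunk[]> {
  if (batch.length > 0 && (batch.length === chunkImportBatchSize || isLastChunk)) {
    await typesenseClient
      .collections(chunkCollectionName)
      .documents()
      .import(batch, { action: 'upsert' });
    return [];
  }
  return batch;
}
```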
cli/src/lib/typesense-search.ts (3 changes: 2 additions & 1 deletion)
```diff
@@ -14,6 +14,7 @@ export interface SearchPhraseDoc {
 export interface SearchPhraseEntry {
   id?: string;
   chunk_id?: string;
   chunk_index?: number;
+  doc_num: string;
   url: string;
   search_phrase: string;
```
```diff
@@ -128,7 +129,7 @@ export async function typesenseRetrieveAllUrls(
   collectionName: string,
   page: number,
   pageSize: number,
-  includeFields: string = 'url_without_anchor,content_markdown,id',
+  includeFields: string = 'doc_num,url_without_anchor,content_markdown,id',
 ): Promise<any> {
   const client = new typesense.Client(cfg.TYPESENSE_CONFIG);
 
```
