Skip to content

Commit 452ec9e

Browse files
Merge pull request #166 from CodeForPhilly/feat/zbl-enahancedsearch
Feat/zbl enahancedsearch
2 parents 0c157bb + 7b1e8fb commit 452ec9e

File tree

5 files changed

+150
-12
lines changed

5 files changed

+150
-12
lines changed

Dockerfile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1-
FROM node@sha256:426f843809ae05f324883afceebaa2b9cab9cb697097dbb1a2a7a41c5701de72
1+
FROM node:20-slim
22

33
# Install ImageMagick and AWS CLI
4-
RUN apk add --no-cache imagemagick aws-cli
4+
RUN apt-get update && apt-get install -y \
5+
imagemagick \
6+
awscli \
7+
&& rm -rf /var/lib/apt/lists/*
58

69
# Set NODE_OPTIONS for OpenSSL compatibility and limit memory usage
710
ENV NODE_OPTIONS="--openssl-legacy-provider --max-old-space-size=768"

app.js

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,45 @@
11
const db = require('./lib/db');
22
const server = require('./lib/server');
3+
const { generateEmbeddings } = require('./scripts/generate-embeddings');
34

45
run();
56

67
/**
 * Application entry point: connects to the database, makes sure plant
 * embeddings exist (generating them when none are found), then starts the
 * server.
 *
 * When no plant document has a non-null `embedding`, the current connection
 * is closed, `generateEmbeddings()` is run (it opens and closes its own
 * connection) and the database is reopened before the server starts. A
 * failure while generating embeddings is logged and the server is still
 * started; per the log message, semantic search will not work in that case.
 *
 * Only the close + generate step is guarded by the try/catch. Starting the
 * server happens exactly once, outside of it, so a server startup failure is
 * neither misreported as an embedding error nor retried.
 *
 * @returns {Promise<void>} Settles once `server()` has settled.
 * @throws Propagates any error from `db()`, `countDocuments()` or `server()`.
 */
async function run() {
  const { plants, nurseries, close } = await db();

  // Check if embeddings exist
  const embeddingCount = await plants.countDocuments({
    embedding: { $exists: true, $ne: null }
  });

  if (embeddingCount !== 0) {
    console.log(`✅ Found ${embeddingCount} plants with embeddings.`);
    await server({ plants, nurseries });
    return;
  }

  console.log('\n⚠️ No embeddings found in database.');
  console.log('🔄 Auto-generating embeddings... This may take a few minutes.\n');

  try {
    // Close the current connection since generateEmbeddings will create its own
    await close();

    // Generate embeddings (this will create its own DB connection)
    await generateEmbeddings();

    console.log('\n✅ Embeddings generated successfully!');
    console.log('🔄 Reconnecting to database...\n');
  } catch (error) {
    console.error('\n❌ Error generating embeddings:', error);
    console.error('⚠️ Server will start without embeddings. Semantic search will not work.');
    console.error('💡 Run manually: npm run generate-embeddings\n');
  }

  // The original handles were closed above, so always start on a fresh
  // connection, whether or not embedding generation succeeded.
  const dbConnection = await db();
  await server({ plants: dbConnection.plants, nurseries: dbConnection.nurseries });
}

lib/embeddings.js

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,53 @@
22
let transformersModule = null;
33
let embeddingModel = null;
44

5+
/**
 * Install a `fetch` replacement on `global` (and `globalThis`) that repairs
 * malformed HuggingFace URLs before delegating to node-fetch.
 *
 * String URLs containing both "localhost" and "huggingface.co" (for example
 * "http://localhost:3000https://huggingface.co/...") are reduced to the
 * embedded HuggingFace URL. Every other input is passed through unchanged.
 * The override is reinstalled on every call, as a backup in case the SSR
 * bundle overwrote it.
 */
function ensureFetchOverride() {
  if (typeof global === 'undefined') {
    return;
  }

  const nodeFetch = require('node-fetch');

  // Returns the URL to actually request; non-strings and well-formed URLs
  // come back untouched.
  const repairUrl = (candidate) => {
    if (typeof candidate !== 'string') {
      return candidate;
    }

    const looksMalformed =
      candidate.includes('localhost') && candidate.includes('huggingface.co');
    if (!looksMalformed) {
      return candidate;
    }

    // Preferred: pull out the embedded HuggingFace URL directly
    const embedded = candidate.match(/https?:\/\/[^/]*huggingface\.co\/.*/);
    if (embedded) {
      return embedded[0];
    }

    // Fallback: keep everything from the first "https://" onwards
    const start = candidate.indexOf('https://');
    return start === -1 ? candidate : candidate.substring(start);
  };

  // All requests go through node-fetch (more reliable in Node.js environment)
  const fixedFetch = function(url, options) {
    return nodeFetch(repairUrl(url), options);
  };

  // Set unconditionally so the override is always the active fetch
  global.fetch = fixedFetch;
  if (typeof globalThis !== 'undefined') {
    globalThis.fetch = fixedFetch;
  }
}
44+
545
/**
646
* Get the transformers module using dynamic import (required for ES modules)
747
*/
848
async function getTransformersModule() {
949
if (!transformersModule) {
50+
// Ensure fetch is overridden before importing transformers
51+
ensureFetchOverride();
1052
transformersModule = await import('@xenova/transformers');
1153
}
1254
return transformersModule;
@@ -20,6 +62,10 @@ async function getEmbeddingModel() {
2062
if (!embeddingModel) {
2163
console.log('Loading embedding model...');
2264
const { pipeline } = await getTransformersModule();
65+
66+
// global.fetch has already been overridden by ensureFetchOverride(), which
67+
// getTransformersModule() calls before importing transformers, to fix malformed URLs from SSR webpack polyfills
68+
2369
embeddingModel = await pipeline(
2470
'feature-extraction',
2571
'Xenova/all-MiniLM-L6-v2' // Lightweight model, ~80MB, good for semantic search

lib/server.js

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,39 @@ if (!isDevelopment) {
1616
manifest = require("../ssr/ssr-manifest.json");
1717
const appPath = path.join(__dirname, "../ssr", manifest["app.js"]);
1818
renderer = require(appPath).default;
19+
20+
// Override global.fetch AFTER SSR bundle loads to fix malformed URLs
21+
// The SSR bundle sets its own fetch that prepends localhost:3000 to HuggingFace URLs
22+
const nodeFetch = require('node-fetch');
23+
const fixedFetch = function(url, options) {
24+
let fetchUrl = url;
25+
26+
if (typeof url === 'string') {
27+
// Fix malformed URLs like "http://localhost:3000https://huggingface.co/..."
28+
if (url.includes('localhost') && url.includes('huggingface.co')) {
29+
// Extract the actual HuggingFace URL (everything from https:// onwards)
30+
const httpsMatch = url.match(/https?:\/\/[^/]*huggingface\.co\/.*/);
31+
if (httpsMatch) {
32+
fetchUrl = httpsMatch[0];
33+
} else {
34+
// Fallback: find https:// in the string and use everything from there
35+
const httpsIndex = url.indexOf('https://');
36+
if (httpsIndex !== -1) {
37+
fetchUrl = url.substring(httpsIndex);
38+
}
39+
}
40+
}
41+
}
42+
43+
// Use node-fetch for all requests (more reliable in Node.js environment)
44+
return nodeFetch(fetchUrl, options);
45+
};
46+
47+
// Override on both global and globalThis to ensure it's caught
48+
global.fetch = fixedFetch;
49+
if (typeof globalThis !== 'undefined') {
50+
globalThis.fetch = fixedFetch;
51+
}
1952
} catch (e) {
2053
console.warn("SSR build not found, running in development mode");
2154
}
@@ -557,6 +590,7 @@ module.exports = async function ({ plants, nurseries }) {
557590
// Fetch all candidates (we'll calculate similarity and sort in JS)
558591
// For performance, we might want to limit this, but for now get all matching filters
559592
const candidates = await plants.find(baseQuery).toArray();
593+
console.log(`Semantic search: Found ${candidates.length} candidates with embeddings`);
560594

561595
// Calculate similarity scores
562596
const queryEmbedding = query._queryEmbedding;
@@ -573,7 +607,12 @@ module.exports = async function ({ plants, nurseries }) {
573607
_semanticScore: similarity
574608
};
575609
})
576-
.filter(plant => plant && plant._semanticScore >= 0.3) // Minimum similarity threshold
610+
.filter(plant => {
611+
if (!plant || plant._semanticScore < 0.3) {
612+
return false;
613+
}
614+
return true;
615+
}) // Minimum similarity threshold
577616
.sort((a, b) => {
578617
// If user wants to sort by Search Relevance, prioritize semantic score
579618
if (originalSortKeys.includes("_semanticScore") || req.query.sort === "Sort by Search Relevance") {

scripts/generate-embeddings.js

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,30 @@ async function generateEmbeddings() {
7272
}
7373

7474
await close();
75-
process.exit(0);
75+
return { processed, updated, skipped };
7676
}
7777

78-
// Handle errors
79-
process.on('unhandledRejection', (error) => {
80-
console.error('Unhandled error:', error);
81-
process.exit(1);
82-
});
78+
// Export for use in other modules
79+
module.exports = { generateEmbeddings };
80+
81+
// Standalone entry point: only executes when this file is run directly,
// not when it is required by another module.
if (require.main === module) {
  // Shared failure path: report the error and exit non-zero
  const exitWithError = (error) => {
    console.error('Unhandled error:', error);
    process.exit(1);
  };

  // Handle errors for standalone execution
  process.on('unhandledRejection', (error) => exitWithError(error));

  generateEmbeddings()
    .then(() => process.exit(0))
    .catch((error) => exitWithError(error));
}
8398

84-
generateEmbeddings();
8599

86100

87101

0 commit comments

Comments
 (0)