// LM StudioをScreaming Frogで使うためのCustom JSコードを貼っておきます。上部のMODEL_IDENTIFIERは使うモデル名に変更してください。コードはPassage EmbeddingsをLM Studio用に書き換えたものです。
// =================================================================================
// Screaming Frog Recipe: Semantic Chunking + Local Embeddings via LM Studio
//
// This script is modified to use a local embedding model (like EmbeddingGemma)
// running on LM Studio, which provides an OpenAI-compatible API.
//
// SETUP REQUIRED:
// 1. Install LM Studio on your Mac: https://lmstudio.ai/
// 2. Download an embedding model (e.g., "EmbeddingGemma") from the search tab.
// 3. Go to the "Local Server" tab (server icon on the left).
// 4. Select your downloaded embedding model.
// 5. Start the server. Keep LM Studio running while you crawl.
//
// SCRIPT CONFIGURATION:
// - Verify the MODEL_IDENTIFIER matches the one shown in LM Studio.
// - Verify the LM_STUDIO_URL is correct (default is http://localhost:1234).
// =================================================================================
// --- LM Studio Configuration ---
// Model name sent as the `model` field of every embeddings request and copied
// into each result as `embeddingModel`. It should match the model identifier
// shown in your LM Studio server tab.
// NOTE(review): the value below is a placeholder-style default; confirm it
// against the identifier LM Studio actually displays for your model.
const MODEL_IDENTIFIER = 'google/embedding-gemma-v1.5';
// Full URL of the embeddings endpoint that getEmbeddings() POSTs to.
// Default URL for LM Studio's local server. Change the port if you have configured it differently.
const LM_STUDIO_URL = 'http://localhost:1234/v1/embeddings';
/**
 * Get embeddings for an array of texts using the local LM Studio server.
 *
 * Sends a single POST request to LM_STUDIO_URL using the OpenAI-compatible
 * embeddings payload ({ input, model }) and unwraps the `embedding` array of
 * every item in the response's `data` list.
 *
 * @param {string[]} texts - An array of text strings to embed.
 * @returns {Promise<number[][]>} A promise that resolves to an array of
 *   embedding vectors, one per item of the response's `data` list.
 * @throws {Error} If the request cannot be made, the server answers with a
 *   non-OK status, or the response body has no `data` array. The underlying
 *   error is attached as `cause`.
 */
async function getEmbeddings(texts) {
  try {
    const response = await fetch(LM_STUDIO_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      // OpenAI-compatible API format
      body: JSON.stringify({
        input: texts,
        model: MODEL_IDENTIFIER,
      }),
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`LM Studio API error (${response.status}): ${errorText}`);
    }

    const data = await response.json();
    if (!Array.isArray(data?.data)) {
      throw new Error('Unexpected response shape: missing "data" array');
    }

    // The response is a list of objects, each containing an "embedding" array.
    // The expression must stay on the same line as `return`; a line break after
    // `return` triggers automatic semicolon insertion and yields `undefined`.
    return data.data.map((item) => item.embedding);
  } catch (error) {
    // This often happens if the LM Studio server is not running.
    console.error('Failed to connect to LM Studio API.', error);
    throw new Error(`Could not get embeddings from LM Studio: ${error.message}`, { cause: error });
  }
}
// --- Original Script Logic (Mostly Unchanged) ---
// Configuration for chunking and for batching/retrying embedding requests.
const CONFIG = {
minChunkLength: 50, // Minimum characters per chunk; shorter element texts and shorter split pieces are dropped
maxChunkLength: 500, // Maximum characters per chunk; passed to splitLongText() as its length limit
includeMetadata: true, // Include element type and position info; when false, each chunk's metadata is null
batchSize: 5, // Number of chunk texts sent in one embeddings request (batches are awaited one after another)
retryAttempts: 3, // Upper bound the per-batch attempt counter is compared against
retryDelay: 1000, // Base delay between retries (ms); multiplied by the attempt number before waiting
};
/**
 * Extract semantic passages from the webpage DOM.
 *
 * Each selector group is queried independently over the whole document, and
 * every matching element whose trimmed text is at least CONFIG.minChunkLength
 * characters long is split with splitLongText(). Because container elements
 * (e.g. `article`, `section`) include their descendants' text, the same text
 * can appear in more than one chunk.
 *
 * If no element yields a chunk, the trimmed text of `document.body` is chunked
 * instead.
 *
 * @returns {Array} Array of chunk objects `{ text, index, metadata }`, where
 *   `index` is a running 0-based counter over all chunks and `metadata` is
 *   null when CONFIG.includeMetadata is false.
 */
function extractSemanticChunks() {
  const chunks = [];
  let chunkIndex = 0;
  const semanticSelectors = [
    'h1, h2, h3, h4, h5, h6', 'p', 'li', 'blockquote', 'article',
    'section', 'div[role="main"]', 'td, th', 'figcaption', 'summary', 'dd',
  ];

  semanticSelectors.forEach((selector) => {
    document.querySelectorAll(selector).forEach((element, index) => {
      const text = element.textContent?.trim();
      if (!text || text.length < CONFIG.minChunkLength) return;

      const textChunks = splitLongText(text, CONFIG.maxChunkLength);
      textChunks.forEach((chunkText, subIndex) => {
        chunks.push({
          text: chunkText,
          // Post-increment so every chunk gets a unique, sequential index.
          index: chunkIndex++,
          metadata: CONFIG.includeMetadata ? {
            elementType: element.tagName.toLowerCase(),
            elementIndex: index,
            subChunkIndex: subIndex,
            totalSubChunks: textChunks.length,
            xpath: getXPath(element),
            textLength: chunkText.length,
          } : null,
        });
      });
    });
  });

  // Fallback: nothing matched (or everything was too short), so chunk the
  // whole body text instead.
  if (chunks.length === 0) {
    const bodyText = document.body?.textContent?.trim();
    if (bodyText) {
      const textChunks = splitLongText(bodyText, CONFIG.maxChunkLength);
      textChunks.forEach((chunkText, index) => {
        chunks.push({
          text: chunkText,
          index,
          metadata: CONFIG.includeMetadata ? {
            elementType: 'body',
            elementIndex: 0,
            subChunkIndex: index,
            totalSubChunks: textChunks.length,
            xpath: '/html/body',
            textLength: chunkText.length,
          } : null,
        });
      });
    }
  }

  return chunks;
}
/**
 * Split long text into smaller chunks while preserving sentence boundaries.
 *
 * Text that already fits within `maxLength` is returned unchanged as a single
 * chunk. Otherwise the text is split after `.`, `!` or `?` followed by
 * whitespace, and sentences are packed greedily into chunks (joined by one
 * space). A sentence that is itself longer than `maxLength` is packed word by
 * word instead. A single space-free word longer than `maxLength` is emitted
 * as-is, so only such words can exceed the limit.
 *
 * Chunks shorter than CONFIG.minChunkLength are dropped from the result (this
 * filter is not applied to text that fits in one chunk).
 *
 * @param {string} text - Text to split
 * @param {number} maxLength - Maximum length per chunk
 * @returns {Array} Array of text chunks
 */
function splitLongText(text, maxLength) {
  if (text.length <= maxLength) return [text];

  const chunks = [];
  const sentences = text.split(/(?<=[.!?])\s+/);
  let currentChunk = '';

  for (const sentence of sentences) {
    // Measure the candidate including its joining space so a chunk can never
    // end up one character over the limit.
    const candidate = currentChunk ? `${currentChunk} ${sentence}` : sentence;
    if (candidate.length <= maxLength) {
      currentChunk = candidate;
      continue;
    }

    if (currentChunk) chunks.push(currentChunk);

    if (sentence.length <= maxLength) {
      currentChunk = sentence;
      continue;
    }

    // The sentence alone is too long: fall back to packing individual words.
    let wordChunk = '';
    for (const word of sentence.split(' ')) {
      const joined = wordChunk ? `${wordChunk} ${word}` : word;
      if (joined.length > maxLength && wordChunk) {
        chunks.push(wordChunk);
        wordChunk = word;
      } else {
        wordChunk = joined;
      }
    }
    currentChunk = wordChunk;
  }

  if (currentChunk) chunks.push(currentChunk);
  return chunks.filter((chunk) => chunk.length >= CONFIG.minChunkLength);
}
/**
 * Get XPath for an element.
 *
 * Elements with an `id` resolve to the short form `//*[@id="..."]`. Otherwise
 * the path is built by walking up through `parentNode`, emitting the lowercase
 * tag name of each element node. A 1-based positional predicate is added only
 * when the element has preceding siblings with the same tag name, so the first
 * such sibling is written without a predicate.
 *
 * @param {Element} element - DOM element
 * @returns {string} XPath string, or '' when no element node was visited
 */
function getXPath(element) {
  // Keep the template literal on one line: a line break inside it would become
  // part of the returned string.
  if (element.id) return `//*[@id="${element.id}"]`;

  const parts = [];
  let node = element;
  while (node && node.nodeType === Node.ELEMENT_NODE) {
    // Count preceding element siblings that share this tag name.
    let index = 0;
    for (let sibling = node.previousSibling; sibling; sibling = sibling.previousSibling) {
      if (sibling.nodeType === Node.ELEMENT_NODE && sibling.tagName === node.tagName) {
        index++;
      }
    }
    const tagName = node.tagName.toLowerCase();
    const pathIndex = index > 0 ? `[${index + 1}]` : '';
    parts.unshift(`${tagName}${pathIndex}`);
    node = node.parentNode;
  }

  return parts.length ? `/${parts.join('/')}` : '';
}
/**
 * Process chunks in batches with retry logic.
 *
 * Chunks are sliced into batches of CONFIG.batchSize and each batch is sent to
 * getEmbeddings() in turn (one batch at a time). A failing batch is retried
 * until CONFIG.retryAttempts attempts have been made, waiting
 * CONFIG.retryDelay * attempt milliseconds between attempts. When the final
 * attempt fails, the batch's chunks are still emitted, with `embedding: null`
 * and the last error message in `error`.
 *
 * @param {Array} chunks - Array of chunk objects (each with a `text` property)
 * @returns {Promise} Promise resolving to embedded chunks: each input chunk
 *   extended with `embedding`, `embeddingModel`, `processingTimestamp` and,
 *   on failure, `error`
 */
async function processChunksWithRetry(chunks) {
  const results = [];
  // Guard against a non-positive or non-integer batch size, which would stall the loop.
  const batchSize = Number.isInteger(CONFIG.batchSize) && CONFIG.batchSize > 0
    ? CONFIG.batchSize
    : 1;

  for (let i = 0; i < chunks.length; i += batchSize) {
    const batch = chunks.slice(i, i + batchSize);
    const texts = batch.map((chunk) => chunk.text);
    const batchNumber = Math.floor(i / batchSize) + 1;
    let attempt = 0;
    let success = false;

    while (attempt < CONFIG.retryAttempts && !success) {
      try {
        const embeddings = await getEmbeddings(texts);
        // A short or missing list would silently pair chunks with `undefined`.
        if (!Array.isArray(embeddings) || embeddings.length !== batch.length) {
          const received = Array.isArray(embeddings) ? embeddings.length : 'none';
          throw new Error(`Expected ${batch.length} embeddings, received ${received}`);
        }
        batch.forEach((chunk, index) => {
          results.push({
            ...chunk,
            embedding: embeddings[index],
            embeddingModel: MODEL_IDENTIFIER,
            processingTimestamp: new Date().toISOString(),
          });
        });
        success = true;
      } catch (error) {
        attempt++;
        console.warn(`Batch ${batchNumber} attempt ${attempt} failed:`, error.message);
        if (attempt >= CONFIG.retryAttempts) {
          // Out of attempts: record the failure but keep the chunks in the output.
          batch.forEach((chunk) => {
            results.push({
              ...chunk,
              embedding: null,
              error: error.message,
              embeddingModel: MODEL_IDENTIFIER,
              processingTimestamp: new Date().toISOString(),
            });
          });
        } else {
          // Linear backoff: wait longer after each failed attempt.
          await new Promise((resolve) => setTimeout(resolve, CONFIG.retryDelay * attempt));
        }
      }
    }
  }

  return results;
}
/**
 * Main processing function.
 *
 * Extracts chunks from the current page, embeds them via
 * processChunksWithRetry(), and builds a report object. Errors are caught and
 * turned into a `{ success: false, error, ... }` object, so the returned
 * promise resolves in both cases.
 *
 * @returns {Promise} Promise resolving to processing results: on success an
 *   object with counts, page URL/title, the embedded chunks and a summary; on
 *   failure an object with `success: false` and the error message
 */
async function processPageEmbeddings() {
  try {
    console.log('Extracting semantic chunks from webpage...');
    const chunks = extractSemanticChunks();
    console.log(`Extracted ${chunks.length} semantic chunks`);
    if (chunks.length === 0) {
      throw new Error('No content chunks found on the page');
    }

    console.log(`Processing embeddings using local model ${MODEL_IDENTIFIER}...`);
    const embeddedChunks = await processChunksWithRetry(chunks);

    // Only an actual vector counts as a success; null (failed batch) and
    // undefined (missing entry) are both failures.
    const successfulEmbeddings = embeddedChunks
      .filter((chunk) => Array.isArray(chunk.embedding))
      .length;
    const failedEmbeddings = embeddedChunks.length - successfulEmbeddings;
    const totalTextLength = embeddedChunks.reduce((sum, chunk) => sum + chunk.text.length, 0);

    const result = {
      success: true,
      model: MODEL_IDENTIFIER,
      totalChunks: embeddedChunks.length,
      successfulEmbeddings,
      failedEmbeddings,
      processingTimestamp: new Date().toISOString(),
      pageUrl: window.location.href,
      pageTitle: document.title,
      chunks: embeddedChunks,
      summary: {
        // `|| 1` avoids dividing by zero when no chunks came back.
        avgChunkLength: Math.round(totalTextLength / (embeddedChunks.length || 1)),
        elementTypes: [...new Set(
          embeddedChunks.map((chunk) => chunk.metadata?.elementType).filter(Boolean),
        )],
        // Vector length of the first successfully embedded chunk, if any.
        embeddingDimensions: embeddedChunks.find((chunk) => chunk.embedding)?.embedding?.length || null,
      },
    };

    console.log(`Processing complete: ${successfulEmbeddings}/${embeddedChunks.length} chunks embedded successfully`);
    return result;
  } catch (error) {
    console.error('Processing failed:', error);
    return {
      success: false,
      error: error.message,
      model: MODEL_IDENTIFIER,
      processingTimestamp: new Date().toISOString(),
      pageUrl: window.location.href,
      pageTitle: document.title,
    };
  }
}
// Execute the main function and return results to Screaming Frog.
// The result object is serialized as pretty-printed JSON (2-space indent) and
// handed to seoSpider.data(); if the chain rejects, the message is passed to
// seoSpider.error() instead.
// NOTE(review): a top-level `return` is only valid if the host wraps this
// snippet in a function body — presumably Screaming Frog does; confirm.
return processPageEmbeddings()
.then(result =>
seoSpider.data(JSON.stringify(result, null, 2)))
.catch(error => seoSpider.error(`Script execution failed: ${error.message}`));