Bulk Import URLs & Meta Data into Google Sheets for SEO

SEO, Web Development

Author: Mike Ciffone

Published: 01.08.2024 / Updated: 01.08.2024

To use this script, start by cloning my template file. Then navigate to Extensions in the Sheets menu and click Apps Script. Copy and paste the code below. Go back to your spreadsheet and add your sitemap link to cell B1. It can be either an individual sitemap file or a sitemap_index file.
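For example, on a site running Yoast SEO the index usually lives at a URL like https://example.com/sitemap_index.xml (swap in your own domain for example.com).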

Once you’ve connected the script, a new item labeled “Sitemap” will appear in the Sheets menu. In the dropdown there will be a menu item labeled “Import URLs & Meta”. Use that to run the script directly from Sheets.

I have only tested the sitemap_index approach on sites that use Yoast SEO or Rank Math – if you run into an issue with another plugin, message me on LinkedIn or X and I’ll update it. One heads-up: if you refresh the page, for some reason Sheets throws an error message like this:

Error fetching sitemap: You do not have permission to call UrlFetchApp.fetch. Required permissions: https://www.googleapis.com/auth/script.external_request

It will go away after a second, and then you’ll see the menu item appear. I don’t know how to suppress that.

You can also duplicate the sheet and put an individual sitemap in each tab for better organization. This can make the script run faster on larger sites, too. The largest site I tested the sitemap_index approach on had roughly 2,000 URLs across 6 sitemaps.

I also left the console logs I used for debugging in the code (commented out) in case anyone wants to tinker with it.

Credit to Daniel Foley Carter for creating the getRelevanceScore function.



function onOpen() {
    const ui = SpreadsheetApp.getUi();
    ui.createMenu('Sitemap')
        .addItem('Import URLs & Meta', 'initSitemapUrl')
        .addToUi();
}

const state = {
    sheet: SpreadsheetApp.getActiveSpreadsheet().getActiveSheet(),
    totalUrlsProcessed: 0
};

function showAlert(message) {
    const ui = SpreadsheetApp.getUi();
    ui.alert(message);
}

function decodeHtmlEntities(text) {
    const entities = {
        '&lt;': '<',
        '&gt;': '>',
        '&amp;': '&',
        '&quot;': '"',
        '&#039;': "'",
        // Add more entities here as needed
    };
    return text.replace(/&[^;]+;/g, match => entities[match] || match);
}

function getRelevanceScore(title, metaDescription, heading) {

    // Convert to lowercase
    title = title.toLowerCase();
    metaDescription = metaDescription.toLowerCase();
    heading = heading.toLowerCase();

    // Split into words
    const titleWords = title.split(/\s+/);
    const metaWords = metaDescription.split(/\s+/);
    const headingWords = heading.split(/\s+/);

    // Find common words among title, meta description, and h1
    const commonWords = titleWords.filter(value => metaWords.includes(value) && headingWords.includes(value));

    // Calculate relevance based on common words
    const minLength = Math.min(titleWords.length, metaWords.length, headingWords.length);
    if (commonWords.length > minLength / 2) {
        return "Highly relevant";
    } else if (commonWords.length > 2) {
        return "Relevant";
    } else if (commonWords.length > 0) {
        return "Less Relevant";
    } else {
        return "Completely Irrelevant";
    }
}

function extractTagContent(html, startTag, endTag) {
    const startIndex = html.indexOf(startTag);
    if (startIndex !== -1) {
        const endIndex = html.indexOf(endTag, startIndex + startTag.length);

        if (endIndex !== -1) {
            let content = html.substring(startIndex + startTag.length, endIndex).trim();
            content = content.replace(/<[^>]*>/g, "");
            content = decodeHtmlEntities(content);
            return content;
        }
    }

    return "";
}

function getMetaData(data, index = 0) {
    if (index >= data.length) {
        //console.log("All URLs processed in current sitemap");
        state.totalUrlsProcessed += data.length;  // Update the total count of processed URLs
        return;
    }

    const url = data[index][0];
    //console.log("Processing URL:", url);

    try {
        const response = UrlFetchApp.fetch(url, { 'muteHttpExceptions': true });
        const htmlContent = response.getContentText();
        const title = extractTagContent(htmlContent, '<title>', '</title>');
        const metaDescription = extractTagContent(htmlContent, '<meta name="description" content="', '"');
        const h1 = extractTagContent(htmlContent, '<h1>', '</h1>');

        // Write back to the sheet
        const startRow = state.totalUrlsProcessed + 3;  // Adjust the start row based on totalUrlsProcessed
        state.sheet.getRange(startRow + index, 2).setValue(title);
        state.sheet.getRange(startRow + index, 3).setValue(metaDescription);
        state.sheet.getRange(startRow + index, 4).setValue(h1);

        // Calculate and set the relevance score
        const relevanceScore = getRelevanceScore(title, metaDescription, h1);
        state.sheet.getRange(startRow + index, 5).setValue(relevanceScore); // Assuming column E (5) is for relevance score

        // Process the next URL
        getMetaData(data, index + 1);

    } catch (e) {
        //console.error('Error fetching URL: ' + url + ', Error: ' + e);
        const startRow = state.totalUrlsProcessed + 3;
        state.sheet.getRange(startRow + index, 2).setValue('Error');
        state.sheet.getRange(startRow + index, 3).setValue('Error');
        state.sheet.getRange(startRow + index, 4).setValue('Error');

        // Process the next URL
        getMetaData(data, index + 1);
    }
}

function populateUrls(data) {
    const sheet = state.sheet;
    //console.log("Populating URLs to the sheet, URL count:", data.length);

    if (data.length > 0) {
        // Find the last row with data in column 1 (URLs column)
        const lastRow = sheet.getLastRow();
        const startRow = lastRow + 1; // Start writing from the next empty row

        sheet.getRange(startRow, 1, data.length, 1).setValues(data);
        //console.log("URLs populated, starting at row:", startRow);
    } else {
        //console.log("No URLs to populate");
    }

    getMetaData(data); // Call getMetaData after populating URLs
}

function parseUrls(urls, namespace) {
    const data = [];
    const processedUrlsSet = new Set(); // Set to track already processed URLs

    urls.forEach((urlElement) => {
        const locElement = urlElement.getChild('loc', namespace);

        if (locElement) {
            const loc = locElement.getText();

            // Check if URL has already been processed
            if (!loc.endsWith("locations.kml") && !processedUrlsSet.has(loc)) {
                processedUrlsSet.add(loc);
                data.push([loc]);
            }
        }
    });

    populateUrls(data);
}

function getSitemap(siteMapUrl) {
    try {
        const response = UrlFetchApp.fetch(siteMapUrl, { muteHttpExceptions: true });
        if (response.getResponseCode() === 200) {
            const xml = response.getContentText();
            importUrls(xml);
        } else {
            showAlert(`Sitemap not found (HTTP response: ${response.getResponseCode()})`);
        }
    } catch (e) {
        showAlert(`Error fetching sitemap: ${e.message}`);
    }
}

function initSitemapUrl() {
    const siteMapUrl = state.sheet.getRange(1, 2).getValue();
    if (siteMapUrl) {
        getSitemap(siteMapUrl);
    } else {
        showAlert(`Enter a sitemap URL in Cell B1`);
    }
}

function importUrls(xml) {
    const document = XmlService.parse(xml);
    const root = document.getRootElement();
    const namespace = root.getNamespace();

    if (root.getName() === 'sitemapindex') {
        const sitemaps = root.getChildren('sitemap', namespace);

        //console.log("Sitemap index found with", sitemaps.length, "sitemaps");
        sitemaps.forEach((sitemapElement) => {
            const loc = sitemapElement.getChild('loc', namespace).getText();

            //console.log("Sitemap URL:", loc);
            fetchAndProcessSitemap(loc);
        });
    } else if (root.getName() === 'urlset') {
        //console.log("Individual sitemap found");
        const urls = root.getChildren('url', namespace);

        parseUrls(urls, namespace);
    } else {
        //console.log("Unknown XML format");
    }
}

function fetchAndProcessSitemap(sitemapUrl) {
    try {
        const response = UrlFetchApp.fetch(sitemapUrl, { 'muteHttpExceptions': true });
        const xmlContent = response.getContentText();
        importUrls(xmlContent); // Recursively process the sitemap
    } catch (e) {
        //console.error('Error fetching or processing sitemap:', sitemapUrl, 'Error:', e);
        // showAlert takes a single message string, so build it with a template literal
        showAlert(`Error fetching or processing sitemap: ${sitemapUrl} (${e.message})`);
        return;
    }
}
initSitemapUrl();

Here’s a breakdown of each function:

onOpen()

Triggered when the Google Sheet is opened.
Creates a custom menu in the Google Sheets UI titled ‘Sitemap’.
Adds an item to this menu (‘Import URLs & Meta’) which, when clicked, triggers the initSitemapUrl() function.

initSitemapUrl()

Retrieves the sitemap URL entered in a specific cell (B1) of the active spreadsheet.
Calls getSitemap(siteMapUrl) to begin processing the sitemap.

getSitemap(siteMapUrl)

Fetches the sitemap XML from the provided URL.
If successful, it passes the XML content to importUrls(xml) for further processing.

importUrls(xml)

Parses the XML content of the sitemap.
Determines if the XML represents a sitemap index (a collection of sitemaps) or a single sitemap (urlset).
If it’s a sitemap index, it iteratively processes each sitemap by calling fetchAndProcessSitemap(loc) for each sitemap URL.
If it’s a single sitemap, it extracts URLs using parseUrls(urls, namespace).
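
For reference, the two formats look roughly like this (trimmed-down, made-up examples) – a sitemap index wraps <sitemap> entries that point at other sitemap files, while a urlset lists the page URLs themselves:

<!-- Sitemap index: root element is <sitemapindex> -->
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/post-sitemap.xml</loc></sitemap>
  <sitemap><loc>https://example.com/page-sitemap.xml</loc></sitemap>
</sitemapindex>

<!-- Individual sitemap: root element is <urlset> -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/about/</loc></url>
</urlset>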

fetchAndProcessSitemap(sitemapUrl)

Fetches and processes an individual sitemap URL encountered in a sitemap index.
Recursively calls importUrls(xmlContent) to handle the sitemap content.

parseUrls(urls, namespace)

Extracts individual URLs from a sitemap.
Checks for duplicates and filters out any URLs ending with “locations.kml”.
Calls populateUrls(data) to write the URLs to the spreadsheet.
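
The array it hands off has one URL per row, which is the two-dimensional shape setValues() expects in populateUrls – roughly (placeholder URLs):

[
  ['https://example.com/'],
  ['https://example.com/about/'],
  ['https://example.com/blog/post-1/']
]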

populateUrls(data)

Writes the extracted URLs to the spreadsheet, starting from the next empty row after the last populated row.
After writing the URLs, it initiates metadata extraction for each URL by calling getMetaData(data).

getMetaData(data, index)

Sequentially processes each URL in the provided data array.
For each URL, it fetches the page content and extracts the title, meta description, and the first H1 tag.
Writes this information to the respective rows in the spreadsheet.
Calculates and writes a relevance score based on the commonality of words between the title, meta description, and H1 tag.
Recursively calls itself to process the next URL in the list.
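
This is also the natural place to tinker if you want to pull extra elements. As a rough sketch (not part of the script above – the meta robots tag and column F are just examples), something like this could go inside the try block:

// Hypothetical addition: also grab the meta robots tag and write it to column F (6)
const robots = extractTagContent(htmlContent, '<meta name="robots" content="', '"');
state.sheet.getRange(startRow + index, 6).setValue(robots);

The same caveat applies as with the description extraction: it only matches the tag when the name attribute is written before content.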

decodeHtmlEntities(text)

Converts HTML entities in a string (like &, <, >) to their corresponding characters.
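
For example (illustrative input, not from a real page):

decodeHtmlEntities('Fish &amp; Chips &#039;Menu&#039;'); // returns: Fish & Chips 'Menu'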

extractTagContent(html, startTag, endTag)

Extracts content from within specified start and end HTML tags.
Also cleans up the extracted content by removing any nested HTML tags and decoding HTML entities.
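
For example, pulling a title out of fetched HTML (the markup here is made up):

const html = '<head><title>Pricing &amp; Plans | Example Co</title></head>';
extractTagContent(html, '<title>', '</title>'); // returns: Pricing & Plans | Example Co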

getRelevanceScore(title, metaDescription, heading)

Analyzes the title, meta description, and H1 tag to calculate a relevance score.
The score is based on the number of common words among these three elements.
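
A quick worked example (made-up strings) shows how the thresholds play out:

getRelevanceScore(
  'Chicago Wedding Photographer | Studio Name',
  'Award-winning Chicago wedding photographer capturing candid moments.',
  'Chicago Wedding Photographer'
);
// "chicago", "wedding", and "photographer" appear in all three strings, so there are 3 common words.
// The shortest element (the H1) has 3 words, and 3 > 3 / 2, so this returns "Highly relevant".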

showAlert(message)

Displays an alert box in the Google Sheets UI with a provided message.

Each function is designed to handle a specific part of the process, from initiating the script, parsing sitemap XML, extracting URLs, and fetching page content, to analyzing and displaying the results in a structured format on the spreadsheet. This modular approach makes the script efficient and maintainable.


Practical Application in SEO

In the current SEO landscape, especially with SGE (Search Generative Experience) on the rise, understanding the minute details of how on-page elements interact is crucial. This script provides a quick and easy way to get a granular view of a few of the main on-page elements for each page.

The script is particularly useful when examining pages bulk to identify where on-page elements are not effectively aligned or where they don’t exist. For instance, a perfectly optimized title tag might be undermined by a poorly crafted H1 tag. Some pages might be missing an h1 tag. This tool allows you to quickly pinpoint such discrepancies and address them to ensure that all elements contribute positively toward your SEO.