9 Commits

Author SHA1 Message Date
6153b70c1e new way of scraping 2025-03-24 20:24:30 +01:00
47134525b8 save work 2025-03-24 19:00:26 +01:00
fe177abd85 Merge branch 'main' of https://git.yasue.org/ken/tripadviser_scraper 2025-03-24 15:27:52 +01:00
79ea65c74b save changes 2025-03-24 15:27:16 +01:00
1ffea0f61f auto scroll 2025-03-24 06:57:25 +01:00
b6cad2a241 Merge branch 'main' of https://git.yasue.org/ken/tripadviser_scraper 2025-03-24 06:47:48 +01:00
6f36809932 wip 2025-03-24 06:47:38 +01:00
0e6df98a50 refactor 2025-03-24 06:45:47 +01:00
d0bfe15fa4 wip 2025-03-24 06:24:18 +01:00
5 changed files with 3090 additions and 149 deletions

View File

@ -1,27 +1,4 @@
rank,Latitude,Longitude,Name of City,Country,2021 Population,2020 Population,Growth,Population Difference,Population Change
1,35.6828387,139.7594549,Tokyo,Japan,37339804,37393128,-0.0014,53324,declined
2,28.6517178,77.2219388,Delhi,India,31181376,30290936,0.0294,890440,grew
3,31.2322758,121.4692071,Shanghai,China,27795702,27058480,0.0272,737222,grew
4,-23.5506507,-46.6333824,Sao Paulo,Brazil,22237472,22043028,0.0088,194444,grew
5,19.4326296,-99.1331785,Mexico City,Mexico,21918936,21782378,0.0063,136558,grew
6,23.7861979,90.4026151,Dhaka,Bangladesh,21741090,21005860,0.035,735230,grew
7,30.0443879,31.2357257,Cairo,Egypt,21322750,20900604,0.0202,422146,grew
8,39.906217,116.3912757,Beijing,China,20896820,20462610,0.0212,434210,grew
9,19.0759899,72.8773928,Mumbai,India,20667656,20411274,0.0126,256382,grew
10,34.6198813,135.490357,Osaka,Japan,19110616,19165340,-0.0029,54724,declined
11,24.8546842,67.0207055,Karachi,Pakistan,16459472,16093786,0.0227,365686,grew
12,29.5647398,106.5478767,Chongqing,China,16382376,15872179,0.0321,510197,grew
13,41.0096334,28.9651646,Istanbul,Turkey,15415197,15190336,0.0148,224861,grew
14,-34.6075682,-58.4370894,Buenos Aires,Argentina,15257673,15153729,0.0069,103944,grew
15,22.5414185,88.3576912,Kolkata,India,14974073,14850066,0.0084,124007,grew
16,-4.3217055,15.3125974,Kinshasa,DR Congo,14970460,14342439,0.0438,628021,grew
17,6.4550575,3.3941795,Lagos,Nigeria,14862111,14368332,0.0344,493779,grew
18,14.5907332,120.9809674,Manila,Philippines,14158573,13923452,0.0169,235121,grew
19,39.0856735,117.1951073,Tianjin,China,13794450,13589078,0.0151,205372,grew
20,23.1301964,113.2592945,Guangzhou,China,13635397,13301532,0.0251,333865,grew
21,-22.9110137,-43.2093727,Rio de Janeiro,Brazil,13544462,13458075,0.0064,86387,grew
22,31.5656822,74.3141829,Lahore,Pakistan,13095166,12642423,0.0358,452743,grew
23,12.9767936,77.590082,Bangalore,India,12764935,12326532,0.0356,438403,grew
24,55.7504461,37.6174943,Moscow,Russia,12593252,12537954,0.0044,55298,grew
25,22.555454,114.0543297,Shenzhen,China,12591696,12356820,0.019,234876,grew
26,13.0836939,80.270186,Chennai,India,11235018,10971108,0.0241,263910,grew

1 rank Latitude Longitude Name of City Country 2021 Population 2020 Population Growth Population Difference Population Change
1 35.6828387 139.7594549 Tokyo Japan 37339804 37393128 -0.0014 53324 declined
2 28.6517178 77.2219388 Delhi India 31181376 30290936 0.0294 890440 grew
3 31.2322758 121.4692071 Shanghai China 27795702 27058480 0.0272 737222 grew
4 -23.5506507 -46.6333824 Sao Paulo Brazil 22237472 22043028 0.0088 194444 grew
5 19.4326296 -99.1331785 Mexico City Mexico 21918936 21782378 0.0063 136558 grew
6 23.7861979 90.4026151 Dhaka Bangladesh 21741090 21005860 0.035 735230 grew
7 30.0443879 31.2357257 Cairo Egypt 21322750 20900604 0.0202 422146 grew
8 39.906217 116.3912757 Beijing China 20896820 20462610 0.0212 434210 grew
9 19.0759899 72.8773928 Mumbai India 20667656 20411274 0.0126 256382 grew
10 34.6198813 135.490357 Osaka Japan 19110616 19165340 -0.0029 54724 declined
11 24.8546842 67.0207055 Karachi Pakistan 16459472 16093786 0.0227 365686 grew
12 29.5647398 106.5478767 Chongqing China 16382376 15872179 0.0321 510197 grew
13 41.0096334 28.9651646 Istanbul Turkey 15415197 15190336 0.0148 224861 grew
14 -34.6075682 -58.4370894 Buenos Aires Argentina 15257673 15153729 0.0069 103944 grew
15 22.5414185 88.3576912 Kolkata India 14974073 14850066 0.0084 124007 grew
16 -4.3217055 15.3125974 Kinshasa DR Congo 14970460 14342439 0.0438 628021 grew
17 6.4550575 3.3941795 Lagos Nigeria 14862111 14368332 0.0344 493779 grew
18 14.5907332 120.9809674 Manila Philippines 14158573 13923452 0.0169 235121 grew
19 39.0856735 117.1951073 Tianjin China 13794450 13589078 0.0151 205372 grew
20 23.1301964 113.2592945 Guangzhou China 13635397 13301532 0.0251 333865 grew
21 -22.9110137 -43.2093727 Rio de Janeiro Brazil 13544462 13458075 0.0064 86387 grew
22 31.5656822 74.3141829 Lahore Pakistan 13095166 12642423 0.0358 452743 grew
23 12.9767936 77.590082 Bangalore India 12764935 12326532 0.0356 438403 grew
2 24 55.7504461 37.6174943 Moscow Russia 12593252 12537954 0.0044 55298 grew
3 25 22.555454 114.0543297 Shenzhen China 12591696 12356820 0.019 234876 grew
4 26 13.0836939 80.270186 Chennai India 11235018 10971108 0.0241 263910 grew

2812
data/contact_info.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -9,65 +9,27 @@ import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome';
import * as fs from 'fs';
import * as path from 'path';
import { getCities } from './lib/cities';
import { WebDriverUtils, saveContactInfoToCSV } from './lib/utils';
import { WebDriverUtils, saveContactInfoToCSV, useExistingChrome, disableCookiesInChrome, useChrome } from './lib/utils';
import * as UIActions from './lib/UIActions';
import { randomUUID } from 'crypto';
import os from 'os';
import edge from 'selenium-webdriver/edge';
/**
* Function to visit TripAdvisor pages for each city
*/
async function visitCityPages(): Promise<void> {
const userHomeDir = os.homedir(); // gets C:\Users\<YourName>
const driverPath = path.join(userHomeDir, 'Documents', 'edgedriver_win64', 'msedgedriver.exe');
// Configure Edge service to use your custom driver path
const service = new edge.ServiceBuilder(driverPath);
const options = new edge.Options();
options.addArguments('--inprivate');
options.addArguments('--start-maximized');
let driver: WebDriver;
driver = await new Builder()
.forBrowser('MicrosoftEdge')
.setEdgeOptions(options)
.setEdgeService(service)
.build();
await driver.get('https://www.tripadvisor.com');
await WebDriverUtils.wait(5);
/*
// Connect to an existing Chrome browser running in debug mode on port 9222
const options = new chrome.Options();
// Set the debugger address to connect to the existing Chrome instance
options.debuggerAddress('localhost:9222');
// Create WebDriver instance that connects to the existing browser
const driver: WebDriver = await new Builder()
.forBrowser('chrome')
.setChromeOptions(options)
.build();
*/
const cities = getCities(path.join(__dirname, '../data/cities.csv'));
console.log('Connecting to existing Chrome browser...');
const driver = await useExistingChrome();
if (!driver) return;
// Visit each city's TripAdvisor page
for (let i = 0; i < cities.length; i++) {
const city = cities[i];
console.log(`[${i + 1}/${cities.length}] Visiting TripAdvisor page for ${city}...`);
let originalWindow;
let cityTopWindow;
let attactionsWindow;
let museumWindow;
try {
@ -75,107 +37,65 @@ async function visitCityPages(): Promise<void> {
console.log("Logo click")
if (!await UIActions.gotoHome(driver)) throw `${city} failed`;
await WebDriverUtils.wait(5);
await WebDriverUtils.wait(driver);
console.log("Exec Search")
if (!await UIActions.execSearch(driver, city)) throw `${city} failed`;
await WebDriverUtils.wait(5);
console.log("Click See all")
if (!await UIActions.clickSeeAll(driver)) {
if (!await UIActions.clickTourismLink(driver)) throw `${city} failed`;
if (!await UIActions.clickSeeAll(driver)) throw `${city} failed`;
}
await WebDriverUtils.wait(5);
console.log("Switch tab")
let windows = await driver.getAllWindowHandles();
// Switch to the newly opened window/tab
for (const handle of windows) {
if (handle !== originalWindow) {
cityTopWindow = handle;
await driver.switchTo().window(handle);
}
}
console.log("Click See all attractions")
if (!await UIActions.clickSeeAllAttractions(driver)) throw `${city} failed`;
await WebDriverUtils.wait(5);
console.log("Switch tab to Attraction")
windows = await driver.getAllWindowHandles();
// Switch to the newly opened window/tab
for (const handle of windows) {
if (handle !== originalWindow && handle !== cityTopWindow) {
attactionsWindow = handle;
await driver.switchTo().window(attactionsWindow);
}
}
// click museum
console.log("Click Museum link");
if (!await UIActions.clickMuseumsLink(driver)) throw `${city} failed`;
await WebDriverUtils.wait(5);
if (!await UIActions.execSearch(driver, `${city} museum`)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
let page = 1;
let museumNames: string[] = [];
while (1) {
// get list of museums
console.log("Get list of museums");
const museumElms = await UIActions.getMusiums(driver);
await WebDriverUtils.wait(1);
for (const listItem of museumElms) {
await listItem.click();
await WebDriverUtils.wait(3);
windows = await driver.getAllWindowHandles();
for (const handle of windows) {
if (handle !== originalWindow && handle !== cityTopWindow && handle !== attactionsWindow) {
museumWindow = handle;
await driver.switchTo().window(museumWindow);
}
}
const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
console.log(`${websiteUrl} / ${email}`);
saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv'));
museumWindow && await driver.switchTo().window(museumWindow);
await driver.close();
await WebDriverUtils.wait(1);
attactionsWindow && await driver.switchTo().window(attactionsWindow);
await WebDriverUtils.wait(1);
}
const tmpMuseumNames = await UIActions.getAttractionNames(driver);
await WebDriverUtils.wait(driver);
museumNames = [...museumNames, ...tmpMuseumNames]
page++;
if (page > 10) break;
UIActions.clickPagination(driver, page);
await WebDriverUtils.wait(5);
await WebDriverUtils.wait(driver);
}
for (const museumName of museumNames) {
console.log("back to home");
if (!await UIActions.gotoHome(driver)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
if (!await UIActions.typeSearch(driver, `${city} ${museumName}`)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
if (!await UIActions.clickFirstAttractionLinkInForm(driver)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
console.log(`${websiteUrl} / ${email}`);
saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv'));
}
console.log(museumNames);
await UIActions.closeAllTabsExceptFirst(driver);
if (i < cities.length - 1) {
console.log(`Waiting for 5000 seconds before next city...`);
await WebDriverUtils.wait(5); // Wait 5000 seconds before next city
await WebDriverUtils.wait(); // Wait 5000 seconds before next city
}
} catch (error) {
await UIActions.closeAllTabsExceptFirst(driver);
} catch (error) {
// If the button is not found within the timeout, log and continue to the next city
console.log(`No Museums button found for ${city}. Moving to next city after 5 seconds...`);
await WebDriverUtils.wait(5); // Wait 5 seconds before next city
await WebDriverUtils.wait(); // Wait 5 seconds before next city
}
}

View File

@ -8,7 +8,7 @@ import { ContactInfo } from './types';
export async function execSearch(driver: WebDriver, city: string): Promise<boolean> {
export async function execSearch(driver: WebDriver, searchTerm: string): Promise<boolean> {
try {
// Find the search input field
const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';
@ -18,12 +18,12 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
// Clear any existing text and enter the city name
await searchInput.clear();
await searchInput.sendKeys(city);
await searchInput.sendKeys(searchTerm);
// Submit the search (press Enter)
await WebDriverUtils.wait(2);
await WebDriverUtils.wait(driver);
await searchInput.sendKeys('\uE007'); // Unicode for Enter key
await WebDriverUtils.wait(5); // Wait 5 seconds before next city
await WebDriverUtils.wait(driver); // Wait 5 seconds before next city
return true;
} catch (e) {
@ -31,6 +31,25 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
}
}
/**
 * Types a search term into the site search box without submitting it.
 *
 * Unlike a full search, no Enter key is sent: the text is only entered so
 * that the caller can interact with whatever the page shows next.
 *
 * @param driver WebDriver session to operate on.
 * @param searchTerm Text to type into the search box.
 * @returns `true` once the text has been typed; `false` if locating,
 *          clearing or typing into the input throws (the failure is logged).
 */
export async function typeSearch(driver: WebDriver, searchTerm: string): Promise<boolean> {
  const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';

  try {
    // Wait for the search input before touching it (10000 is the timeout
    // handed to the helper — presumably milliseconds; confirm in utils).
    await WebDriverUtils.waitForElement(driver, searchSelector, 10000);
    console.log("Search box found");

    const searchInput = await driver.findElement(By.css(searchSelector));

    // Replace any existing text with the search term.
    await searchInput.clear();
    await searchInput.sendKeys(searchTerm);

    return true;
  } catch (e) {
    // Report the failure instead of swallowing it, so a `false` result can
    // be diagnosed from the log.
    console.warn(`Could not type "${searchTerm}" into the search box:`, e);
    return false;
  }
}
export async function clickSeeAll(driver: WebDriver): Promise<boolean> {
try {
const seeAllElement = await driver.wait(
@ -49,6 +68,21 @@ export async function clickSeeAll(driver: WebDriver): Promise<boolean> {
}
}
/**
 * Reads the href of the first `/Attractions…` link found next to the
 * "Things to do" heading.
 *
 * @param driver WebDriver session to query.
 * @returns The link's `href` attribute, or an empty string when the link
 *          cannot be located within the 5000 timeout (a warning is logged).
 */
export async function getSeeAllUrl(driver: WebDriver): Promise<string> {
  try {
    const locator = By.xpath(
      `//h3[normalize-space(.)='Things to do']/ancestor::div[1]//a[starts-with(@href, '/Attractions')]`
    );
    const link = await driver.wait(until.elementLocated(locator), 5000);
    const href = await link.getAttribute('href');

    console.log('Found Attractions URL:', href);
    return href;
  } catch (failure) {
    console.warn('Could not find the Attractions link:', failure);
    return "";
  }
}
export async function gotoHome(driver: WebDriver): Promise<boolean> {
try {
// Click on the Tripadvisor logo before searching for the city
@ -65,7 +99,7 @@ export async function gotoHome(driver: WebDriver): Promise<boolean> {
}
export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean> {
const xpath = `//h2[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
const xpath = `//h3[starts-with(normalize-space(.), 'Things to do')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
try {
const anchorElement = await driver.wait(until.elementLocated(By.xpath(xpath)), 5000);
@ -82,6 +116,22 @@ export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean
}
}
/**
 * Reads the href of the "See all" link that sits under a heading starting
 * with "Top Attractions in".
 *
 * The link must be located and become visible (each wait uses a 5000
 * timeout) before its href is read.
 *
 * @param driver WebDriver session to query.
 * @returns The link's `href` attribute, or `null` when the link is not found,
 *          never becomes visible, or its href cannot be read (a warning is
 *          logged).
 */
export async function getSeeAllAttractionsUrl(driver: WebDriver): Promise<string | null> {
  try {
    const locator = By.xpath(
      `//h3[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`
    );
    const seeAllLink = await driver.wait(until.elementLocated(locator), 5000);

    const visibility = until.elementIsVisible(seeAllLink);
    await driver.wait(visibility, 5000);

    return await seeAllLink.getAttribute('href');
  } catch (failure) {
    console.warn('Element not found or href not retrievable.', failure);
    return null;
  }
}
export async function clickMuseumsLink(driver: WebDriver): Promise<boolean> {
const xpath = `//a[.//*[normalize-space(.)='Museums']]`;
@ -130,6 +180,49 @@ export async function getMusiums(driver: WebDriver): Promise<WebElement[]> {
}
}
/**
 * Collects the hrefs of image links pointing at `/Attraction…` pages inside
 * sections that also contain an `<h3>`.
 *
 * @param driver WebDriver session to query.
 * @returns The non-empty `href` values in document order; an empty array if
 *          the lookup throws (a warning is logged).
 */
export async function getMuseumsLinks(driver: WebDriver): Promise<string[]> {
  const xpath = `//div//section[.//a[starts-with(@href, '/Attraction')] and .//h3]//a[starts-with(@href, '/Attraction') and .//img]`;

  try {
    const anchors = await driver.findElements(By.xpath(xpath));
    const collected: string[] = [];

    // Attributes are read one at a time, in order.
    for (let i = 0; i < anchors.length; i++) {
      const target = await anchors[i].getAttribute('href');
      if (!target) {
        continue; // skip anchors without a usable href
      }
      collected.push(target);
    }

    return collected;
  } catch (failure) {
    console.warn('Error getting attraction URLs:', failure);
    return [];
  }
}
/**
 * Collects attraction names from the `<h3>` headings that are direct
 * children of links whose href starts with `/Attraction_Review`.
 *
 * Each heading text is trimmed and a leading rank prefix such as `"12. "` is
 * removed. Headings that are empty after this clean-up are skipped, so the
 * result never contains blank names.
 *
 * @param driver WebDriver session to query.
 * @returns Cleaned, non-empty names in document order; an empty array if the
 *          lookup throws (a warning is logged).
 */
export async function getAttractionNames(driver: WebDriver): Promise<string[]> {
  // XPath to find <h3> inside <a> whose href starts with /Attraction_Review
  const xpath = `//a[starts-with(@href, '/Attraction_Review')]/h3`;
  // Leading "<digits>." rank marker plus any following whitespace.
  const rankPrefix = /^\d+\.\s*/;

  try {
    const h3Elements = await driver.findElements(By.xpath(xpath));
    const names: string[] = [];

    for (const h3 of h3Elements) {
      const text = await h3.getText();
      // Clean first, then test for emptiness: whitespace-only or rank-only
      // headings must not yield "" entries.
      const name = text ? text.trim().replace(rankPrefix, '') : '';
      if (name) {
        names.push(name);
      }
    }

    return names;
  } catch (error) {
    console.warn('Error getting attraction names:', error);
    return [];
  }
}
export async function getWebsiteAndEmail(driver: WebDriver): Promise<ContactInfo> {
const result: ContactInfo = { websiteUrl: null, email: null };
@ -193,4 +286,24 @@ export async function closeAllTabsExceptFirst(driver: WebDriver): Promise<void>
await driver.switchTo().window(originalHandle);
console.log(`Switched back to original tab: ${originalHandle}`);
}
}
/**
 * Clicks the first link inside a `<form>` whose href starts with
 * `/Attraction`.
 *
 * The link must be located, visible and enabled (each wait uses a 5000
 * timeout) before it is clicked.
 *
 * @param driver WebDriver session to operate on.
 * @returns `true` after a successful click; `false` if any wait or the click
 *          itself throws (a warning is logged).
 */
export async function clickFirstAttractionLinkInForm(driver: WebDriver): Promise<boolean> {
  let clicked = false;

  try {
    const locator = By.xpath(`//form//a[starts-with(@href, '/Attraction')]`);
    const anchor = await driver.wait(until.elementLocated(locator), 5000);

    // Make sure the link can actually be interacted with.
    const shown = until.elementIsVisible(anchor);
    await driver.wait(shown, 5000);
    const usable = until.elementIsEnabled(anchor);
    await driver.wait(usable, 5000);

    await anchor.click();
    console.log('Clicked the first attraction link in the form.');
    clicked = true;
  } catch (failure) {
    console.warn('Attraction link not found or not clickable.', failure);
  }

  return clicked;
}

View File

@ -2,7 +2,8 @@
* Utility class for common WebDriver operations
*/
import { WebDriver, By, until } from 'selenium-webdriver';
import { Builder, By, until, WebDriver } from 'selenium-webdriver';
import chrome from 'selenium-webdriver/chrome';
import { writeFileSync, existsSync, appendFileSync } from 'fs';
import * as path from 'path';
import { ContactInfo } from './types';
@ -13,9 +14,33 @@ export class WebDriverUtils {
* @param seconds Number of seconds to wait
* @returns Promise that resolves after the specified time
*/
static async wait(seconds: number): Promise<void> {
console.log(`Waiting for ${seconds} seconds...`);
return new Promise(resolve => setTimeout(resolve, seconds * 1000));
static async wait(driver?: WebDriver): Promise<void> {
const seconds = Math.floor(Math.random() * 1000) % 3 + 3;
console.log(`Scrolling to bottom for ${seconds} seconds...`);
const endTime = Date.now() + seconds * 1000;
let scrollCounter = 0;
while (Date.now() < endTime) {
/*
try {
if (driver) {
await driver.executeScript(`
window.scrollBy(0, 10);
`);
scrollCounter++;
}
} catch (error) {
console.warn('Scroll failed:', error);
}
*/
// Wait a little between scrolls
await new Promise(resolve => setTimeout(resolve, 500));
}
}
/**
@ -43,3 +68,97 @@ export function saveContactInfoToCSV(city: string, contactInfo: ContactInfo, fil
console.log(`Contact info saved to ${filePath}`);
}
/**
 * Launches an incognito, maximized Chrome session on the TripAdvisor home
 * page.
 *
 * NOTE: despite the name, cookies are not blocked — the preference that would
 * block them is commented out in the launcher. This function was a
 * line-for-line copy of `useChrome` with its default URL, so it now simply
 * delegates to it to keep the two from drifting apart.
 *
 * @returns Whatever `useChrome` returns for the TripAdvisor home page.
 */
export async function disableCookiesInChrome(): Promise<WebDriver | null> {
  // URL passed explicitly so this keeps working if useChrome's default changes.
  return useChrome('https://www.tripadvisor.com/');
}
/**
 * Launches an incognito, maximized Chrome session and opens `initialUrl`.
 *
 * After the page loads, a throw-away cookie is added and the cookie jar is
 * logged as a diagnostic. That probe is optional: if it fails, a warning is
 * logged and the session is still returned.
 *
 * @param initialUrl Page to open right after launch.
 * @returns The ready WebDriver, or `null` when the browser could not be
 *          started or the initial page could not be opened. In the latter
 *          case the half-started browser is shut down first.
 */
export async function useChrome(initialUrl: string = "https://www.tripadvisor.com/"): Promise<WebDriver | null> {
  // Set Chrome options
  const options = new chrome.Options();

  // 1. Block all cookies (currently switched off)
  //options.setUserPreferences({
  //  'profile.default_content_setting_values.cookies': 2, // 2 = Block all
  //  'profile.block_third_party_cookies': true
  //});

  // 2. Optional: Launch in incognito for extra privacy
  options.addArguments('--incognito');
  options.addArguments('--start-maximized');

  let driver: WebDriver | null = null;

  try {
    driver = await new Builder()
      .forBrowser('chrome')
      .setChromeOptions(options)
      .build();

    await driver.get(initialUrl);
    // Cookie blocking is disabled above, so report what actually happened.
    console.log('Chrome launched in incognito mode.');

    // Optional diagnostic: try to set/get a cookie. A failure here must not
    // be treated as a failed launch.
    try {
      await driver.manage().addCookie({ name: 'test', value: '123' });
      const cookies = await driver.manage().getCookies();
      console.log('Cookies after trying to add:', cookies);
    } catch (probeError) {
      console.warn('Cookie probe failed:', probeError);
    }

    return driver;
  } catch (error) {
    console.error('Error:', error);

    // Do not hand a broken session to the caller: close whatever was started
    // and signal the failure with null, as the return type promises.
    if (driver) {
      try {
        await driver.quit();
      } catch (quitError) {
        console.warn('Failed to close Chrome after launch error:', quitError);
      }
    }
    return null;
  }
}
/**
 * Attaches a WebDriver session to a Chrome instance that is already running
 * with remote debugging enabled.
 *
 * @param debuggerAddress `host:port` of the running Chrome's debugger.
 *        Defaults to `localhost:9222`, the address that was previously
 *        hard-coded.
 * @returns A WebDriver connected to that browser. Connection errors are not
 *          caught here; they propagate to the caller.
 */
export async function useExistingChrome(debuggerAddress: string = 'localhost:9222'): Promise<WebDriver> {
  const options = new chrome.Options();

  // Point the driver at the existing browser instead of launching a new one.
  options.debuggerAddress(debuggerAddress);

  const driver: WebDriver = await new Builder()
    .forBrowser('chrome')
    .setChromeOptions(options)
    .build();

  return driver;
}