8 Commits

Author SHA1 Message Date
607bb07520 not finished 2025-03-24 15:47:10 +01:00
fe177abd85 Merge branch 'main' of https://git.yasue.org/ken/tripadviser_scraper 2025-03-24 15:27:52 +01:00
79ea65c74b save changes 2025-03-24 15:27:16 +01:00
1ffea0f61f auto scroll 2025-03-24 06:57:25 +01:00
b6cad2a241 Merge branch 'main' of https://git.yasue.org/ken/tripadviser_scraper 2025-03-24 06:47:48 +01:00
6f36809932 wip 2025-03-24 06:47:38 +01:00
0e6df98a50 refactor 2025-03-24 06:45:47 +01:00
d0bfe15fa4 wip 2025-03-24 06:24:18 +01:00
5 changed files with 2993 additions and 56 deletions

View File

@ -1,27 +1,4 @@
rank,Latitude,Longitude,Name of City,Country,2021 Population,2020 Population,Growth,Population Difference,Population Change rank,Latitude,Longitude,Name of City,Country,2021 Population,2020 Population,Growth,Population Difference,Population Change
1,35.6828387,139.7594549,Tokyo,Japan,37339804,37393128,-0.0014,53324,declined
2,28.6517178,77.2219388,Delhi,India,31181376,30290936,0.0294,890440,grew
3,31.2322758,121.4692071,Shanghai,China,27795702,27058480,0.0272,737222,grew
4,-23.5506507,-46.6333824,Sao Paulo,Brazil,22237472,22043028,0.0088,194444,grew
5,19.4326296,-99.1331785,Mexico City,Mexico,21918936,21782378,0.0063,136558,grew
6,23.7861979,90.4026151,Dhaka,Bangladesh,21741090,21005860,0.035,735230,grew
7,30.0443879,31.2357257,Cairo,Egypt,21322750,20900604,0.0202,422146,grew
8,39.906217,116.3912757,Beijing,China,20896820,20462610,0.0212,434210,grew
9,19.0759899,72.8773928,Mumbai,India,20667656,20411274,0.0126,256382,grew
10,34.6198813,135.490357,Osaka,Japan,19110616,19165340,-0.0029,54724,declined
11,24.8546842,67.0207055,Karachi,Pakistan,16459472,16093786,0.0227,365686,grew
12,29.5647398,106.5478767,Chongqing,China,16382376,15872179,0.0321,510197,grew
13,41.0096334,28.9651646,Istanbul,Turkey,15415197,15190336,0.0148,224861,grew
14,-34.6075682,-58.4370894,Buenos Aires,Argentina,15257673,15153729,0.0069,103944,grew
15,22.5414185,88.3576912,Kolkata,India,14974073,14850066,0.0084,124007,grew
16,-4.3217055,15.3125974,Kinshasa,DR Congo,14970460,14342439,0.0438,628021,grew
17,6.4550575,3.3941795,Lagos,Nigeria,14862111,14368332,0.0344,493779,grew
18,14.5907332,120.9809674,Manila,Philippines,14158573,13923452,0.0169,235121,grew
19,39.0856735,117.1951073,Tianjin,China,13794450,13589078,0.0151,205372,grew
20,23.1301964,113.2592945,Guangzhou,China,13635397,13301532,0.0251,333865,grew
21,-22.9110137,-43.2093727,Rio de Janeiro,Brazil,13544462,13458075,0.0064,86387,grew
22,31.5656822,74.3141829,Lahore,Pakistan,13095166,12642423,0.0358,452743,grew
23,12.9767936,77.590082,Bangalore,India,12764935,12326532,0.0356,438403,grew
24,55.7504461,37.6174943,Moscow,Russia,12593252,12537954,0.0044,55298,grew 24,55.7504461,37.6174943,Moscow,Russia,12593252,12537954,0.0044,55298,grew
25,22.555454,114.0543297,Shenzhen,China,12591696,12356820,0.019,234876,grew 25,22.555454,114.0543297,Shenzhen,China,12591696,12356820,0.019,234876,grew
26,13.0836939,80.270186,Chennai,India,11235018,10971108,0.0241,263910,grew 26,13.0836939,80.270186,Chennai,India,11235018,10971108,0.0241,263910,grew

1 rank Latitude Longitude Name of City Country 2021 Population 2020 Population Growth Population Difference Population Change
1 35.6828387 139.7594549 Tokyo Japan 37339804 37393128 -0.0014 53324 declined
2 28.6517178 77.2219388 Delhi India 31181376 30290936 0.0294 890440 grew
3 31.2322758 121.4692071 Shanghai China 27795702 27058480 0.0272 737222 grew
4 -23.5506507 -46.6333824 Sao Paulo Brazil 22237472 22043028 0.0088 194444 grew
5 19.4326296 -99.1331785 Mexico City Mexico 21918936 21782378 0.0063 136558 grew
6 23.7861979 90.4026151 Dhaka Bangladesh 21741090 21005860 0.035 735230 grew
7 30.0443879 31.2357257 Cairo Egypt 21322750 20900604 0.0202 422146 grew
8 39.906217 116.3912757 Beijing China 20896820 20462610 0.0212 434210 grew
9 19.0759899 72.8773928 Mumbai India 20667656 20411274 0.0126 256382 grew
10 34.6198813 135.490357 Osaka Japan 19110616 19165340 -0.0029 54724 declined
11 24.8546842 67.0207055 Karachi Pakistan 16459472 16093786 0.0227 365686 grew
12 29.5647398 106.5478767 Chongqing China 16382376 15872179 0.0321 510197 grew
13 41.0096334 28.9651646 Istanbul Turkey 15415197 15190336 0.0148 224861 grew
14 -34.6075682 -58.4370894 Buenos Aires Argentina 15257673 15153729 0.0069 103944 grew
15 22.5414185 88.3576912 Kolkata India 14974073 14850066 0.0084 124007 grew
16 -4.3217055 15.3125974 Kinshasa DR Congo 14970460 14342439 0.0438 628021 grew
17 6.4550575 3.3941795 Lagos Nigeria 14862111 14368332 0.0344 493779 grew
18 14.5907332 120.9809674 Manila Philippines 14158573 13923452 0.0169 235121 grew
19 39.0856735 117.1951073 Tianjin China 13794450 13589078 0.0151 205372 grew
20 23.1301964 113.2592945 Guangzhou China 13635397 13301532 0.0251 333865 grew
21 -22.9110137 -43.2093727 Rio de Janeiro Brazil 13544462 13458075 0.0064 86387 grew
22 31.5656822 74.3141829 Lahore Pakistan 13095166 12642423 0.0358 452743 grew
23 12.9767936 77.590082 Bangalore India 12764935 12326532 0.0356 438403 grew
2 24 55.7504461 37.6174943 Moscow Russia 12593252 12537954 0.0044 55298 grew
3 25 22.555454 114.0543297 Shenzhen China 12591696 12356820 0.019 234876 grew
4 26 13.0836939 80.270186 Chennai India 11235018 10971108 0.0241 263910 grew

2801
data/contact_info.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,7 @@ import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome';
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import { getCities } from './lib/cities'; import { getCities } from './lib/cities';
import { WebDriverUtils, saveContactInfoToCSV } from './lib/utils'; import { WebDriverUtils, saveContactInfoToCSV, useExistingChrome, disableCookiesInChrome, useChrome } from './lib/utils';
import * as UIActions from './lib/UIActions'; import * as UIActions from './lib/UIActions';
import { randomUUID } from 'crypto'; import { randomUUID } from 'crypto';
@ -23,17 +23,8 @@ async function visitCityPages(): Promise<void> {
console.log('Connecting to existing Chrome browser...'); console.log('Connecting to existing Chrome browser...');
// Connect to an existing Chrome browser running in debug mode on port 9222 const driver = await useChrome();
const options = new chrome.Options(); if (!driver) return;
// Set the debugger address to connect to the existing Chrome instance
options.debuggerAddress('localhost:9222');
// Create WebDriver instance that connects to the existing browser
const driver: WebDriver = await new Builder()
.forBrowser('chrome')
.setChromeOptions(options)
.build();
// Visit each city's TripAdvisor page // Visit each city's TripAdvisor page
for (let i = 0; i < cities.length; i++) { for (let i = 0; i < cities.length; i++) {
@ -50,18 +41,33 @@ async function visitCityPages(): Promise<void> {
console.log("Logo click") console.log("Logo click")
if (!await UIActions.gotoHome(driver)) throw `${city} failed`; if (!await UIActions.gotoHome(driver)) throw `${city} failed`;
await WebDriverUtils.wait(5); await WebDriverUtils.wait(driver);
console.log("Exec Search") console.log("Exec Search")
if (!await UIActions.execSearch(driver, city)) throw `${city} failed`; if (!await UIActions.execSearch(driver, city)) throw `${city} failed`;
await WebDriverUtils.wait(5); await WebDriverUtils.wait(driver);
console.log("Click See all") console.log("Click See all")
if (!await UIActions.clickSeeAll(driver)) { let seeAllUrl = await UIActions.getSeeAllUrl(driver);
if (seeAllUrl.length == 0) {
if (!await UIActions.clickTourismLink(driver)) throw `${city} failed`; if (!await UIActions.clickTourismLink(driver)) throw `${city} failed`;
if (!await UIActions.clickSeeAll(driver)) throw `${city} failed`; seeAllUrl = await UIActions.getSeeAllUrl(driver);
} }
await WebDriverUtils.wait(5);
if (seeAllUrl.length == 0) throw `${city} failed`;
await WebDriverUtils.wait();
// open new incognito window
const driver2 = await useChrome();
if (!driver2) throw `${city} failed`;
await WebDriverUtils.wait();
await driver2.get(seeAllUrl);
await WebDriverUtils.wait();
console.log("Switch tab") console.log("Switch tab")
let windows = await driver.getAllWindowHandles(); let windows = await driver.getAllWindowHandles();
@ -74,8 +80,8 @@ async function visitCityPages(): Promise<void> {
} }
console.log("Click See all attractions") console.log("Click See all attractions")
if (!await UIActions.clickSeeAllAttractions(driver)) throw `${city} failed`; if (!await UIActions.getSeeAllAttractionsUrl(driver)) throw `${city} failed`;
await WebDriverUtils.wait(5); await WebDriverUtils.wait();
console.log("Switch tab to Attraction") console.log("Switch tab to Attraction")
windows = await driver.getAllWindowHandles(); windows = await driver.getAllWindowHandles();
@ -90,7 +96,7 @@ async function visitCityPages(): Promise<void> {
// click museum // click museum
console.log("Click Museum link"); console.log("Click Museum link");
if (!await UIActions.clickMuseumsLink(driver)) throw `${city} failed`; if (!await UIActions.clickMuseumsLink(driver)) throw `${city} failed`;
await WebDriverUtils.wait(5); await WebDriverUtils.wait(driver);
let page = 1; let page = 1;
while (1) { while (1) {
@ -98,12 +104,12 @@ async function visitCityPages(): Promise<void> {
// get list of museums // get list of museums
console.log("Get list of museums"); console.log("Get list of museums");
const museumElms = await UIActions.getMusiums(driver); const museumElms = await UIActions.getMusiums(driver);
await WebDriverUtils.wait(1); await WebDriverUtils.wait(driver);
for (const listItem of museumElms) { for (const listItem of museumElms) {
await listItem.click(); await listItem.click();
await WebDriverUtils.wait(3); await WebDriverUtils.wait(driver);
windows = await driver.getAllWindowHandles(); windows = await driver.getAllWindowHandles();
for (const handle of windows) { for (const handle of windows) {
@ -120,10 +126,10 @@ async function visitCityPages(): Promise<void> {
museumWindow && await driver.switchTo().window(museumWindow); museumWindow && await driver.switchTo().window(museumWindow);
await driver.close(); await driver.close();
await WebDriverUtils.wait(1); await WebDriverUtils.wait(driver);
attactionsWindow && await driver.switchTo().window(attactionsWindow); attactionsWindow && await driver.switchTo().window(attactionsWindow);
await WebDriverUtils.wait(1); await WebDriverUtils.wait(driver);
} }
@ -132,7 +138,7 @@ async function visitCityPages(): Promise<void> {
if (page > 10) break; if (page > 10) break;
UIActions.clickPagination(driver, page); UIActions.clickPagination(driver, page);
await WebDriverUtils.wait(5); await WebDriverUtils.wait(driver);
} }
@ -142,7 +148,7 @@ async function visitCityPages(): Promise<void> {
if (i < cities.length - 1) { if (i < cities.length - 1) {
console.log(`Waiting for 5000 seconds before next city...`); console.log(`Waiting for 5000 seconds before next city...`);
await WebDriverUtils.wait(5); // Wait 5000 seconds before next city await WebDriverUtils.wait(); // Wait 5000 seconds before next city
} }
} catch (error) { } catch (error) {
@ -150,7 +156,7 @@ async function visitCityPages(): Promise<void> {
// If the button is not found within the timeout, log and continue to the next city // If the button is not found within the timeout, log and continue to the next city
console.log(`No Museums button found for ${city}. Moving to next city after 5 seconds...`); console.log(`No Museums button found for ${city}. Moving to next city after 5 seconds...`);
await WebDriverUtils.wait(5); // Wait 5 seconds before next city await WebDriverUtils.wait(); // Wait 5 seconds before next city
} }
} }

View File

@ -21,9 +21,9 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
await searchInput.sendKeys(city); await searchInput.sendKeys(city);
// Submit the search (press Enter) // Submit the search (press Enter)
await WebDriverUtils.wait(2); await WebDriverUtils.wait(driver);
await searchInput.sendKeys('\uE007'); // Unicode for Enter key await searchInput.sendKeys('\uE007'); // Unicode for Enter key
await WebDriverUtils.wait(5); // Wait 5 seconds before next city await WebDriverUtils.wait(driver); // Wait 5 seconds before next city
return true; return true;
} catch (e) { } catch (e) {
@ -49,6 +49,21 @@ export async function clickSeeAll(driver: WebDriver): Promise<boolean> {
} }
} }
/**
 * Reads the href of the Attractions link found under the "Things to do" heading.
 *
 * The anchor is waited for with a 5000 ms timeout. Any error raised while
 * locating the element or reading its attribute is logged as a warning and
 * reported to the caller as an empty string instead of being thrown.
 *
 * @param driver WebDriver session used to locate the anchor
 * @returns The anchor's `href` value, or `""` when the lookup fails
 */
export async function getSeeAllUrl(driver: WebDriver): Promise<string> {
  // Anchor whose href starts with "/Attractions", searched inside the nearest
  // <div> ancestor of the <h3> whose text is exactly "Things to do".
  const attractionsLinkXpath = `//h3[normalize-space(.)='Things to do']/ancestor::div[1]//a[starts-with(@href, '/Attractions')]`;

  try {
    const locator = By.xpath(attractionsLinkXpath);
    const link = await driver.wait(until.elementLocated(locator), 5000);
    const href = await link.getAttribute('href');
    console.log('Found Attractions URL:', href);
    return href;
  } catch (lookupError) {
    console.warn('Could not find the Attractions link:', lookupError);
    // Empty string is the "not found" signal for this function.
    return "";
  }
}
export async function gotoHome(driver: WebDriver): Promise<boolean> { export async function gotoHome(driver: WebDriver): Promise<boolean> {
try { try {
// Click on the Tripadvisor logo before searching for the city // Click on the Tripadvisor logo before searching for the city
@ -82,6 +97,22 @@ export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean
} }
} }
/**
 * Reads the href of the "See all" link in the "Top Attractions in ..." section.
 *
 * The anchor must first be located and then become visible, each step waited
 * for with a 5000 ms timeout. Failures are logged as a warning and reported
 * as `null` instead of being thrown.
 *
 * @param driver WebDriver session used to locate the anchor
 * @returns The anchor's `href` value, or `null` when it cannot be retrieved
 */
export async function getSeeAllAttractionsUrl(driver: WebDriver): Promise<string | null> {
  // Anchor with an "/Attractions..." href containing a <span> reading "See all",
  // searched under the parent of the <h2> that starts with "Top Attractions in".
  const seeAllXpath = `//h2[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;

  let href: string | null = null;
  try {
    const seeAllLink = await driver.wait(until.elementLocated(By.xpath(seeAllXpath)), 5000);
    await driver.wait(until.elementIsVisible(seeAllLink), 5000);
    href = await seeAllLink.getAttribute('href');
  } catch (lookupError) {
    console.warn('Element not found or href not retrievable.', lookupError);
  }
  return href;
}
export async function clickMuseumsLink(driver: WebDriver): Promise<boolean> { export async function clickMuseumsLink(driver: WebDriver): Promise<boolean> {
const xpath = `//a[.//*[normalize-space(.)='Museums']]`; const xpath = `//a[.//*[normalize-space(.)='Museums']]`;

View File

@ -2,7 +2,8 @@
* Utility class for common WebDriver operations * Utility class for common WebDriver operations
*/ */
import { WebDriver, By, until } from 'selenium-webdriver'; import { Builder, By, until, WebDriver } from 'selenium-webdriver';
import chrome from 'selenium-webdriver/chrome';
import { writeFileSync, existsSync, appendFileSync } from 'fs'; import { writeFileSync, existsSync, appendFileSync } from 'fs';
import * as path from 'path'; import * as path from 'path';
import { ContactInfo } from './types'; import { ContactInfo } from './types';
@ -13,9 +14,36 @@ export class WebDriverUtils {
* @param seconds Number of seconds to wait * @param seconds Number of seconds to wait
* @returns Promise that resolves after the specified time * @returns Promise that resolves after the specified time
*/ */
static async wait(seconds: number): Promise<void> { static async wait(driver?: WebDriver): Promise<void> {
console.log(`Waiting for ${seconds} seconds...`); const seconds = Math.floor(Math.random() * 1000) % 3 + 3;
return new Promise(resolve => setTimeout(resolve, seconds * 1000)); console.log(`Scrolling to bottom for ${seconds} seconds...`);
const endTime = Date.now() + seconds * 1000;
let scrollCounter = 0;
while (Date.now() < endTime) {
try {
if(driver){
if(scrollCounter < 4){
await driver.executeScript(`
window.scrollBy(0, window.innerHeight);
`);
}else{
await driver.executeScript(`
window.scrollTo(0, 0);
`);
}
scrollCounter++;
}
} catch (error) {
console.warn('Scroll failed:', error);
}
// Wait a little between scrolls
await new Promise(resolve => setTimeout(resolve, 500));
}
} }
/** /**
@ -43,3 +71,97 @@ export function saveContactInfoToCSV(city: string, contactInfo: ContactInfo, fil
console.log(`Contact info saved to ${filePath}`); console.log(`Contact info saved to ${filePath}`);
} }
/**
 * Launches an incognito, maximized Chrome session and opens the Tripadvisor home page.
 *
 * NOTE: despite the name, no cookie-blocking preference is applied. The
 * `setUserPreferences` call that would block cookies was commented out in the
 * original implementation, which was a line-for-line duplicate of `useChrome`.
 * This function now delegates to `useChrome` so the two cannot drift apart;
 * the name is kept so existing imports keep working.
 *
 * @returns The launched WebDriver, or `null` when the browser could not be
 *          built. A driver that was built but failed a later step is still
 *          returned, matching `useChrome`.
 */
export async function disableCookiesInChrome(): Promise<WebDriver | null> {
  return useChrome();
}
/**
 * Launches a new Chrome session (incognito, maximized) and opens the
 * Tripadvisor home page.
 *
 * After the page loads, a throwaway cookie is added and the cookie jar is
 * logged. Errors at any step are logged and not rethrown.
 *
 * @returns The WebDriver if it was built (even when a later step failed),
 *          or `null` when building the driver itself failed
 */
export async function useChrome(): Promise<WebDriver | null> {
  const chromeOptions = new chrome.Options();

  // Cookie blocking is intentionally NOT applied; the preferences below are
  // left disabled exactly as in the original:
  //   chromeOptions.setUserPreferences({
  //     'profile.default_content_setting_values.cookies': 2, // 2 = Block all
  //     'profile.block_third_party_cookies': true
  //   });

  // Incognito for extra privacy, maximized window.
  for (const flag of ['--incognito', '--start-maximized']) {
    chromeOptions.addArguments(flag);
  }

  let session: WebDriver | null = null;
  try {
    const builder = new Builder().forBrowser('chrome').setChromeOptions(chromeOptions);
    session = await builder.build();

    await session.get('https://www.tripadvisor.com/');
    // NOTE(review): this message overstates what happened, since the
    // cookie-blocking preferences above are not applied.
    console.log('Chrome launched with cookies disabled.');

    // Probe: add a test cookie, then log the cookies the session reports.
    await session.manage().addCookie({ name: 'test', value: '123' });
    const cookies = await session.manage().getCookies();
    console.log('Cookies after trying to add:', cookies);
  } catch (error) {
    console.error('Error:', error);
  }

  // Whatever was assigned before a failure (possibly null) is handed back.
  return session;
}
/**
 * Attaches a WebDriver to an already-running Chrome instance instead of
 * launching a new one.
 *
 * The target is the debugger address `localhost:9222`, so Chrome must already
 * be running in debug mode on that port. Errors from building the driver
 * propagate to the caller.
 *
 * @returns A WebDriver bound to the existing browser
 */
export async function useExistingChrome(): Promise<WebDriver> {
  const attachOptions = new chrome.Options();
  // Point the driver at the running browser's debugger endpoint.
  attachOptions.debuggerAddress('localhost:9222');

  const builder = new Builder().forBrowser('chrome').setChromeOptions(attachOptions);
  return await builder.build();
}