initial commit

This commit is contained in:
Ken Yasue
2025-03-23 20:32:57 +01:00
parent c2e18462be
commit 4b42f7bf3a
14 changed files with 1884 additions and 132 deletions

169
.gitignore vendored
View File

@ -1,138 +1,45 @@
# ---> Node # Dependencies
node_modules/
npm-debug.log
yarn-debug.log
yarn-error.log
# Build output
dist/
build/
*.tsbuildinfo
# Environment variables
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
# IDE and editor files
.idea/
.vscode/*
!.vscode/launch.json
!.vscode/tasks.json
!.vscode/settings.json
!.vscode/extensions.json
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
# OS files
.DS_Store
Thumbs.db
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
.cache
# vitepress build output
**/.vitepress/dist
# vitepress cache directory
**/.vitepress/cache
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
# Session data
sessions/*
!sessions/README.md

34
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,34 @@
{
"version": "0.2.0",
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Launch Program",
"skipFiles": [
"<node_internals>/**"
],
"program": "${workspaceFolder}/src/index.ts",
"preLaunchTask": "tsc: build - tsconfig.json",
"outFiles": [
"${workspaceFolder}/dist/**/*.js"
],
"sourceMaps": true
},
{
"type": "node",
"request": "launch",
"name": "Debug TS with ts-node",
"skipFiles": [
"<node_internals>/**"
],
"runtimeExecutable": "node",
"runtimeArgs": [
"--loader",
"ts-node/esm",
"${workspaceFolder}/src/index.ts"
],
"sourceMaps": true
}
]
}

17
.vscode/tasks.json vendored Normal file
View File

@ -0,0 +1,17 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "typescript",
"tsconfig": "tsconfig.json",
"problemMatcher": [
"$tsc"
],
"group": {
"kind": "build",
"isDefault": true
},
"label": "tsc: build - tsconfig.json"
}
]
}

146
README.md
View File

@ -1,2 +1,146 @@
# tripadviser_scraper: TypeScript Selenium Project
A TypeScript project with Selenium WebDriver and VSCode debugging configured. This project demonstrates how to use Selenium to automate browser interactions, specifically opening TripAdvisor.com.
This project is Git-enabled, allowing you to track changes and revert to previous states if needed.
## Project Structure
```
.
├── .vscode/ # VSCode configuration
│ ├── launch.json # Debug configuration
│ └── tasks.json # Build tasks
├── src/ # Source files
│ └── index.ts # Selenium script to open TripAdvisor.com
├── dist/ # Compiled JavaScript files
├── package.json # Project dependencies and scripts
├── tsconfig.json # TypeScript configuration
└── README.md # This file
```
## Dependencies
This project uses the following dependencies:
- **TypeScript**: JavaScript with syntax for types
- **Selenium WebDriver**: Browser automation framework
- **ChromeDriver**: WebDriver for Chrome browser
## Available Scripts
- `npm run build` - Compiles TypeScript to JavaScript
- `npm run start` - Runs the compiled JavaScript (opens TripAdvisor.com in Chrome)
- `npm run dev` - Runs the TypeScript code directly using ts-node (opens TripAdvisor.com in Chrome)
## Selenium WebDriver
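The main script (`src/index.ts`):
1. Attaches to a Chrome instance that is already running with remote debugging enabled on `localhost:9222` (start Chrome with `--remote-debugging-port=9222` before running the script)
2. Reads the city names from `cities.csv` and searches TripAdvisor.com for each one
3. Opens the city's museum listings, waiting for each page to load, and visits every museum page
4. Saves each museum's website URL and email address to `contact_info.csv`, then closes the extra tabs before moving on to the next city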
### Session Management
This project includes session management capabilities:
- Browser sessions are stored in the `sessions/` directory
- Chrome is configured to use a persistent user data directory
- Sessions persist between runs, allowing for:
- Preserved cookies and login states
- Cached resources for faster loading
- Consistent testing environment
To clear session data and start fresh, you can delete the contents of the `sessions/` directory (except for README.md).
## Debugging in VSCode
This project includes two debug configurations:
1. **Launch Program** - Builds the TypeScript code and then debugs the compiled JavaScript
2. **Debug TS with ts-node** - Directly debugs the TypeScript code using ts-node
Both configurations can be used to debug the Selenium WebDriver script.
To start debugging:
1. Open the Debug view in VSCode (Ctrl+Shift+D or Cmd+Shift+D on macOS)
2. Select the debug configuration you want to use from the dropdown
3. Press F5 or click the green play button
## Adding Breakpoints
1. Click in the gutter next to the line number where you want to add a breakpoint
2. When debugging, execution will pause at the breakpoint
3. You can inspect variables, the call stack, and step through code
## Git Version Control
This project is set up with Git for version control. Here's how to use Git to track and revert changes:
### Viewing Changes
```bash
# See what files have been modified
git status
# See detailed changes in files
git diff
```
### Committing Changes
```bash
# Stage changes for commit
git add .
# Commit changes with a descriptive message
git commit -m "Description of changes"
```
### Reverting Changes
```bash
# Discard changes in working directory for a specific file
git checkout -- <file>
# Discard all changes in working directory
git checkout -- .
# Revert to a specific commit
git reset --hard <commit-hash>
# Undo the last commit but keep the changes
git reset --soft HEAD~1
# Create a new commit that undoes changes from a previous commit
git revert <commit-hash>
```
### Viewing History
```bash
# View commit history
git log
# View commit history with a graph
git log --graph --oneline --all
```
### Branching
```bash
# Create a new branch
git branch <branch-name>
# Switch to a branch
git checkout <branch-name>
# Create and switch to a new branch
git checkout -b <branch-name>
# Merge a branch into the current branch
git merge <branch-name>
```

1137
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

27
package.json Normal file
View File

@ -0,0 +1,27 @@
{
"name": "wenscrapers2",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"build": "tsc",
"start": "node dist/index.js",
"dev": "ts-node src/index.ts",
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"devDependencies": {
"@types/node": "^22.13.10",
"ts-node": "^10.9.2",
"typescript": "^5.8.2"
},
"dependencies": {
"@types/chromedriver": "^81.0.5",
"@types/selenium-webdriver": "^4.1.28",
"chromedriver": "^131.0.0",
"csv-parse": "^5.6.0",
"selenium-webdriver": "^4.29.0"
}
}

29
sessions/README.md Normal file
View File

@ -0,0 +1,29 @@
# Selenium Sessions Directory
This directory is used to store Selenium WebDriver session data, including:
- Browser cookies
- Local storage
- Session storage
- Browser profiles
This allows the Selenium WebDriver to maintain state between runs, which can be useful for:
- Preserving login sessions
- Testing features that require persistent state
- Reducing the need to re-authenticate
- Improving test performance by reusing browser profiles
## Usage
The session data is automatically saved to this directory when the Selenium WebDriver is configured to use it. The data is organized by session ID and can be reused in subsequent runs.
## Maintenance
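It's recommended to periodically clean this directory to remove old or unused session data. You can do this by running the command below; note that it also deletes this README.md, which is tracked in Git and can be restored afterwards with `git checkout -- sessions/README.md`: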
```bash
rm -rf sessions/*
```
Or by selectively removing specific session files that are no longer needed.

164
src/index.ts Normal file
View File

@ -0,0 +1,164 @@
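/**
* Selenium WebDriver script that visits TripAdvisor for each city listed in cities.csv
* and saves the website URL and email address of the museums it finds to contact_info.csv.
*
* It attaches to a Chrome instance that must already be running with remote debugging
* enabled on port 9222, e.g. from PowerShell:
* & 'C:\Program Files\Google\Chrome\Application\chrome.exe' --remote-debugging-port=9222
*/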
import { Builder, By, until, WebDriver } from 'selenium-webdriver';
import * as chromedriver from 'chromedriver';
import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome';
import * as fs from 'fs';
import * as path from 'path';
import { getCities } from './lib/cities';
import { WebDriverUtils, saveContactInfoToCSV } from './lib/utils';
import * as UIActions from './lib/UIActions';
import { randomUUID } from 'crypto';
/**
 * Switches the driver to the last-listed tab whose handle is not in `knownHandles`.
 *
 * @param driver WebDriver whose open tabs are inspected.
 * @param knownHandles Handles of tabs that were already open (`undefined` entries are ignored).
 * @returns Handle of the tab that was switched to, or `undefined` when no new tab
 *          exists (the driver's current tab is then left unchanged).
 */
async function switchToNewTab(
  driver: WebDriver,
  knownHandles: ReadonlyArray<string | undefined>
): Promise<string | undefined> {
  const handles = await driver.getAllWindowHandles();
  const newHandles = handles.filter(handle => !knownHandles.includes(handle));
  if (newHandles.length === 0) return undefined;
  const newest = newHandles[newHandles.length - 1];
  await driver.switchTo().window(newest);
  return newest;
}

/**
 * Visits TripAdvisor for every city listed in `cities.csv` and records the
 * website URL and email address of each museum it finds.
 *
 * The function attaches to a Chrome instance that is already listening on
 * `localhost:9222`; it never quits the driver itself.
 *
 * Per city: home page -> search -> "See all" -> "See all" attractions ->
 * "Museums" -> open every museum on up to 10 result pages and append its
 * contact info to `contact_info.csv` next to this script. Any failure is logged
 * and processing continues with the next city.
 */
async function visitCityPages(): Promise<void> {
  const maxPages = 10;
  const cities = getCities(path.join(__dirname, 'cities.csv'));
  const outputPath = path.join(__dirname, 'contact_info.csv');

  console.log('Connecting to existing Chrome browser...');
  // Attach to the Chrome instance running in debug mode instead of launching a new one.
  const options = new chrome.Options();
  options.debuggerAddress('localhost:9222');
  const driver: WebDriver = await new Builder()
    .forBrowser('chrome')
    .setChromeOptions(options)
    .build();

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`[${i + 1}/${cities.length}] Visiting TripAdvisor page for ${city}...`);

    try {
      const originalWindow = await driver.getWindowHandle();

      console.log("Logo click")
      if (!await UIActions.gotoHome(driver)) throw new Error(`${city}: could not open the home page`);
      await WebDriverUtils.wait(5);

      console.log("Exec Search")
      if (!await UIActions.execSearch(driver, city)) throw new Error(`${city}: search failed`);
      await WebDriverUtils.wait(5);

      console.log("Click See all")
      if (!await UIActions.clickSeeAll(driver)) {
        // Some result pages only expose "See all" after opening the Tourism page.
        if (!await UIActions.clickTourismLink(driver)) throw new Error(`${city}: no Tourism link`);
        if (!await UIActions.clickSeeAll(driver)) throw new Error(`${city}: no "See all" link`);
      }
      await WebDriverUtils.wait(5);

      console.log("Switch tab")
      const cityTopWindow = await switchToNewTab(driver, [originalWindow]);

      console.log("Click See all attractions")
      if (!await UIActions.clickSeeAllAttractions(driver)) throw new Error(`${city}: no attractions link`);
      await WebDriverUtils.wait(5);

      console.log("Switch tab to Attraction")
      const attractionsWindow = await switchToNewTab(driver, [originalWindow, cityTopWindow]);
      // Tab that holds the museum list; fall back to earlier tabs if no new tab was opened.
      const listWindow = attractionsWindow ?? cityTopWindow ?? originalWindow;

      console.log("Click Museum link");
      if (!await UIActions.clickMuseumsLink(driver)) throw new Error(`${city}: no Museums link`);
      await WebDriverUtils.wait(5);

      for (let page = 1; page <= maxPages; page++) {
        console.log("Get list of museums");
        const museumElms = await UIActions.getMusiums(driver);
        await WebDriverUtils.wait(1);

        for (const listItem of museumElms) {
          await listItem.click();
          await WebDriverUtils.wait(3);

          // Recomputed for every museum so a stale handle from the previous one is never reused.
          const museumWindow = await switchToNewTab(driver, [originalWindow, cityTopWindow, attractionsWindow]);
          if (museumWindow === undefined) {
            // Closing here would close the list tab itself, so skip this entry.
            console.warn('Museum link did not open a new tab; skipping it.');
            continue;
          }

          const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
          console.log(`${websiteUrl} / ${email}`);
          saveContactInfoToCSV(city, { websiteUrl, email }, outputPath);

          await driver.close();
          await WebDriverUtils.wait(1);
          await driver.switchTo().window(listWindow);
          await WebDriverUtils.wait(1);
        }

        if (page === maxPages) break;
        // Stop when there is no further page instead of re-reading the same list.
        if (!await UIActions.clickPagination(driver, page + 1)) break;
        await WebDriverUtils.wait(5);
      }
    } catch (error) {
      console.error(`Failed to process ${city}; moving on to the next city.`, error);
    } finally {
      try {
        await UIActions.closeAllTabsExceptFirst(driver);
      } catch (cleanupError) {
        // A cleanup failure must not abort the remaining cities.
        console.warn(`Could not close the extra tabs after ${city}.`, cleanupError);
      }
    }

    if (i < cities.length - 1) {
      await WebDriverUtils.wait(5); // Pause 5 seconds before the next city
    }
  }
  console.log('Finished visiting all cities!');
}
// Entry point: start the scraper and report any error that escapes it.
const reportFatalError = (error: unknown): void => {
  console.error('Error in main function:', error);
};
visitCityPages().catch(reportFatalError);

196
src/lib/UIActions.ts Normal file
View File

@ -0,0 +1,196 @@
import { Builder, By, until, WebDriver, WebElement } from 'selenium-webdriver';
import * as chromedriver from 'chromedriver';
import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome';
import * as fs from 'fs';
import * as path from 'path';
import { WebDriverUtils } from './utils';
import { ContactInfo } from './types';
/**
 * Types a city name into the TripAdvisor search box and submits the search.
 *
 * @param driver WebDriver attached to a page that shows the search box.
 * @param city City name to search for.
 * @returns `true` once the query has been submitted; `false` if the search box
 *          was not found within 10 seconds or any interaction failed.
 */
export async function execSearch(driver: WebDriver, city: string): Promise<boolean> {
  const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';
  try {
    await WebDriverUtils.waitForElement(driver, searchSelector, 10000);
    console.log("Search box found");
    const searchInput = await driver.findElement(By.css(searchSelector));

    // Replace whatever is in the box with the city name.
    await searchInput.clear();
    await searchInput.sendKeys(city);

    // Short pause before submitting with the WebDriver Enter key code.
    await WebDriverUtils.wait(2);
    await searchInput.sendKeys('\uE007');

    // Give the results page time to load.
    await WebDriverUtils.wait(5);
    return true;
  } catch (error) {
    // Report the cause; the caller only sees the boolean result.
    console.warn(`Search for "${city}" failed.`, error);
    return false;
  }
}
/**
 * Clicks a `<span>` whose text contains "See all" on the current page.
 *
 * Each step (locate, visible, enabled) is given up to 5 seconds.
 *
 * @param driver WebDriver attached to the page to act on.
 * @returns `true` if the element was clicked, `false` if any step failed
 *          (failures are not logged).
 */
export async function clickSeeAll(driver: WebDriver): Promise<boolean> {
  const timeoutMs = 5000;
  let clicked = false;
  try {
    const locator = By.xpath("//span[contains(text(), 'See all')]");
    const target = await driver.wait(until.elementLocated(locator), timeoutMs);
    await driver.wait(until.elementIsVisible(target), timeoutMs);
    await driver.wait(until.elementIsEnabled(target), timeoutMs);
    await target.click();
    clicked = true;
  } catch {
    // Not found or not clickable in time: report failure through the return value only.
  }
  return clicked;
}
/**
 * Returns to the TripAdvisor home page by clicking the site logo.
 *
 * The logo is looked up once, without waiting, so it must already be present
 * on the current page.
 *
 * @param driver WebDriver attached to a TripAdvisor page.
 * @returns `true` if the logo was clicked, `false` if it was not found or the
 *          click failed.
 */
export async function gotoHome(driver: WebDriver): Promise<boolean> {
  // NOTE(review): `XpHHt` looks like a generated class name and may change - confirm.
  const logoSelector = 'img.XpHHt[alt="Tripadvisor"]';
  try {
    const logo = await driver.findElement(By.css(logoSelector));
    console.log('Found Tripadvisor logo. Clicking it...');
    await logo.click();
    return true;
  } catch (error) {
    // Report the cause; the caller only sees the boolean result.
    console.warn('Tripadvisor logo not found or not clickable.', error);
    return false;
  }
}
/**
 * Clicks the "See all" link that belongs to the "Top Attractions in ..." heading.
 *
 * The link is an `/Attractions...` anchor under the heading's parent element.
 * Each step (locate, visible, enabled) is given up to 5 seconds.
 *
 * @param driver WebDriver attached to the page to act on.
 * @returns `true` if the link was clicked, `false` (after a warning) otherwise.
 */
export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean> {
  const timeoutMs = 5000;
  const seeAllXPath = `//h2[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
  let clicked = false;
  try {
    const link = await driver.wait(until.elementLocated(By.xpath(seeAllXPath)), timeoutMs);
    await driver.wait(until.elementIsVisible(link), timeoutMs);
    await driver.wait(until.elementIsEnabled(link), timeoutMs);
    await link.click();
    clicked = true;
  } catch (err) {
    console.warn('Element not found or not clickable.', err);
  }
  return clicked;
}
/**
 * Clicks the anchor that contains an element whose text is exactly "Museums".
 *
 * Each step (locate, visible, enabled) is given up to 5 seconds.
 *
 * @param driver WebDriver attached to the page to act on.
 * @returns `true` if the link was clicked, `false` (after a warning) otherwise.
 */
export async function clickMuseumsLink(driver: WebDriver): Promise<boolean> {
  const timeoutMs = 5000;
  const museumsXPath = `//a[.//*[normalize-space(.)='Museums']]`;
  let clicked = false;
  try {
    const link = await driver.wait(until.elementLocated(By.xpath(museumsXPath)), timeoutMs);
    await driver.wait(until.elementIsVisible(link), timeoutMs);
    await driver.wait(until.elementIsEnabled(link), timeoutMs);
    await link.click();
    clicked = true;
  } catch (err) {
    console.warn('Museums link not found or not clickable.', err);
  }
  return clicked;
}
/**
 * Clicks an anchor whose `href` starts with `/Tourism`.
 *
 * Each step (locate, visible, enabled) is given up to 5 seconds.
 *
 * @param driver WebDriver attached to the page to act on.
 * @returns `true` if the link was clicked, `false` (after a warning) otherwise.
 */
export async function clickTourismLink(driver: WebDriver): Promise<boolean> {
  const timeoutMs = 5000;
  const tourismXPath = `//a[starts-with(@href, '/Tourism')]`;
  let clicked = false;
  try {
    const link = await driver.wait(until.elementLocated(By.xpath(tourismXPath)), timeoutMs);
    await driver.wait(until.elementIsVisible(link), timeoutMs);
    await driver.wait(until.elementIsEnabled(link), timeoutMs);
    await link.click();
    clicked = true;
  } catch (err) {
    console.warn('Tourism link not found or not clickable.', err);
  }
  return clicked;
}
/**
 * Collects the image links of the attraction cards on the current listing page.
 *
 * A card is a `<section>` that contains an `<h3>` and an `/Attraction...` link;
 * the returned elements are that section's `/Attraction...` anchors that wrap an image.
 *
 * @param driver WebDriver attached to the listing page.
 * @returns The matching anchors, or an empty array (after a warning) if the lookup fails.
 */
export async function getMusiums(driver: WebDriver): Promise<WebElement[]> {
  const cardLinkXPath = `//div//section[.//a[starts-with(@href, '/Attraction')] and .//h3]//a[starts-with(@href, '/Attraction') and .//img]`;
  let found: WebElement[] = [];
  try {
    found = await driver.findElements(By.xpath(cardLinkXPath));
  } catch (err) {
    console.warn('Error clicking attraction links:', err);
  }
  return found;
}
/**
 * Reads the website URL and email address from the page currently shown.
 *
 * - Website: `href` of the first anchor that starts with `http` and does not
 *   contain `tripadvisor`.
 * - Email: address part of the first `mailto:` anchor, with any `?subject=...`
 *   style header fields removed and percent-encoding decoded.
 *
 * Both lookups are best-effort: a missing element only produces a warning.
 *
 * @param driver WebDriver attached to the detail page.
 * @returns The values found; each field is `null` when nothing usable was found.
 */
export async function getWebsiteAndEmail(driver: WebDriver): Promise<ContactInfo> {
  const result: ContactInfo = { websiteUrl: null, email: null };

  // External link: starts with 'http' but does not point back to tripadvisor.
  const urlXPath = `//a[starts-with(@href, 'http') and not(contains(@href, 'tripadvisor'))]`;
  // Email link: href starts with 'mailto:'.
  const emailXPath = `//a[starts-with(@href, 'mailto:')]`;

  try {
    const urlElement = await driver.findElement(By.xpath(urlXPath));
    result.websiteUrl = await urlElement.getAttribute('href');
  } catch {
    console.warn('Website URL not found.');
  }

  try {
    const emailElement = await driver.findElement(By.xpath(emailXPath));
    const emailHref = await emailElement.getAttribute('href');
    // Keep only the address: drop the scheme and any header fields after '?'.
    const rawAddress = emailHref.replace(/^mailto:/i, '').split('?')[0];
    let address = rawAddress;
    try {
      address = decodeURIComponent(rawAddress);
    } catch {
      // Malformed percent-encoding: keep the raw text rather than lose the address.
    }
    address = address.trim();
    result.email = address.length > 0 ? address : null;
  } catch {
    console.warn('Email address not found.');
  }

  return result;
}
/**
 * Clicks the pagination link whose `aria-label` equals the given page number.
 *
 * Each step (locate, visible, enabled) is given up to 5 seconds.
 *
 * @param driver WebDriver attached to the listing page.
 * @param pageNumber Page number to go to.
 * @returns `true` if the link was clicked, `false` (after a warning) otherwise.
 */
export async function clickPagination(driver: WebDriver, pageNumber: number): Promise<boolean> {
  const timeoutMs = 5000;
  const pageLinkXPath = `//a[@aria-label='${pageNumber}']`;
  let clicked = false;
  try {
    const pageLink = await driver.wait(until.elementLocated(By.xpath(pageLinkXPath)), timeoutMs);
    await driver.wait(until.elementIsVisible(pageLink), timeoutMs);
    await driver.wait(until.elementIsEnabled(pageLink), timeoutMs);
    await pageLink.click();
    clicked = true;
  } catch (err) {
    console.warn(`Error clicking page number ${pageNumber}:`, err);
  }
  return clicked;
}
/**
 * Closes every open tab except the first one reported by the driver, then
 * switches back to that first tab.
 *
 * @param driver WebDriver whose tabs are closed.
 * @returns Resolves when only the first tab is left (or immediately when at
 *          most one tab is open).
 */
export async function closeAllTabsExceptFirst(driver: WebDriver): Promise<void> {
  const allHandles = await driver.getAllWindowHandles();

  if (allHandles.length > 1) {
    const keepHandle = allHandles[0];
    const extraHandles = allHandles.filter((candidate) => candidate !== keepHandle);

    for (const extra of extraHandles) {
      // A tab has to be the active one before it can be closed.
      await driver.switchTo().window(extra);
      await driver.close();
      console.log(`Closed tab: ${extra}`);
    }

    await driver.switchTo().window(keepHandle);
    console.log(`Switched back to original tab: ${keepHandle}`);
    return;
  }

  console.log('Only one tab open; nothing to close.');
}

20
src/lib/cities.ts Normal file
View File

@ -0,0 +1,20 @@
/**
* List of cities to visit on TripAdvisor
*/
import fs from 'fs';
import path from 'path';
import { parse } from 'csv-parse/sync';
/**
 * Loads the city names from a CSV file.
 *
 * The file needs a header row with a `Name of City` column. A leading UTF-8
 * byte-order mark is stripped so it cannot become part of the first header
 * name. Rows with a missing or blank city name are skipped.
 *
 * @param csvFilePath Path of the CSV file to read.
 * @returns The trimmed city names in file order.
 * @throws If the file cannot be read or is not valid CSV.
 */
export function getCities(csvFilePath: string): string[] {
  const fileContent = fs.readFileSync(csvFilePath, 'utf-8');
  const records: Record<string, unknown>[] = parse(fileContent, {
    bom: true,
    columns: true,
    skip_empty_lines: true
  });

  const cities: string[] = [];
  for (const record of records) {
    const name = record['Name of City'];
    if (typeof name !== 'string') continue;
    const trimmed = name.trim();
    if (trimmed.length > 0) cities.push(trimmed);
  }
  return cities;
}

4
src/lib/types.ts Normal file
View File

@ -0,0 +1,4 @@
/** Contact details collected for one page, as filled in by `getWebsiteAndEmail` in UIActions.ts. */
export interface ContactInfo {
/** `href` of the first external (non-TripAdvisor) `http` link on the page, or `null` if none was found. */
websiteUrl: string | null;
/** Address taken from the first `mailto:` link on the page, or `null` if none was found. */
email: string | null;
}

45
src/lib/utils.ts Normal file
View File

@ -0,0 +1,45 @@
/**
* Utility class for common WebDriver operations
*/
import { WebDriver, By, until } from 'selenium-webdriver';
import { writeFileSync, existsSync, appendFileSync } from 'fs';
import * as path from 'path';
import { ContactInfo } from './types';
/**
 * Static helpers for pausing the script and for waiting on page elements.
 */
export class WebDriverUtils {
  /**
   * Pauses for a number of seconds, logging the delay first.
   * @param seconds Number of seconds to wait
   * @returns Promise that resolves once the delay has elapsed
   */
  static async wait(seconds: number): Promise<void> {
    console.log(`Waiting for ${seconds} seconds...`);
    const delayMs = seconds * 1000;
    await new Promise<void>((done) => {
      setTimeout(done, delayMs);
    });
  }

  /**
   * Blocks until an element matching a CSS selector is present on the page.
   * @param driver WebDriver instance
   * @param selector CSS selector for the element
   * @param timeoutMs Timeout in milliseconds (default: 10000)
   * @returns Promise that resolves when the element is found and rejects if
   *          the wait fails
   */
  static async waitForElement(driver: WebDriver, selector: string, timeoutMs: number = 10000): Promise<void> {
    console.log(`Waiting for element: ${selector}`);
    const located = until.elementLocated(By.css(selector));
    await driver.wait(located, timeoutMs);
  }
}
/**
 * Appends one row of contact information to a CSV file, creating the file with
 * a header row when it does not exist yet.
 *
 * Every field is wrapped in double quotes, embedded double quotes are doubled,
 * and `null` values are written as empty fields.
 *
 * @param city City the contact belongs to.
 * @param contactInfo Website URL and email address to record.
 * @param filePath Path of the CSV file to write.
 * @throws If the file cannot be written.
 */
export function saveContactInfoToCSV(city: string, contactInfo: ContactInfo, filePath: string): void {
  // Quote a value for CSV; null/undefined become an empty field.
  const toCsvField = (value: string | null | undefined): string =>
    `"${(value ?? '').replace(/"/g, '""')}"`;

  const headers = 'City,Website URL,Email\n';
  const line = [city, contactInfo.websiteUrl, contactInfo.email].map(toCsvField).join(',') + '\n';

  if (!existsSync(filePath)) {
    writeFileSync(filePath, headers + line);
  } else {
    appendFileSync(filePath, line);
  }
  console.log(`Contact info saved to ${filePath}`);
}

8
test.py Normal file
View File

@ -0,0 +1,8 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Connect Selenium to the Chrome debugger endpoint below (no new browser
# options are set besides the address), then load a page in it.
DEBUGGER_ADDRESS = "localhost:9222"
TARGET_URL = "https://example.com"

options = Options()
options.debugger_address = DEBUGGER_ADDRESS
driver = webdriver.Chrome(options=options)
driver.get(TARGET_URL)

20
tsconfig.json Normal file
View File

@ -0,0 +1,20 @@
{
"compilerOptions": {
"target": "ES2016",
"module": "CommonJS",
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"sourceMap": true
},
"include": [
"src/**/*"
],
"exclude": [
"node_modules",
"**/*.spec.ts"
]
}