Commit 37113c67 by Prasad Gaikwad

Initial commit

parent 4e3958c6
# Use an official Python runtime as a parent image
FROM python:3.10-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# Set work directory
WORKDIR /app
# Install system dependencies required by Playwright's browsers
# Using the combined command to install dependencies for all browsers
# See: https://playwright.dev/docs/docker#install-system-dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
# --- Playwright dependencies ---
libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
# --- Other useful packages ---
curl \
# --- Cleanup ---
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
# Copy the requirements file into the container at /app
COPY requirements.txt setup.py ./
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install -e . --no-deps
# Install Playwright browsers
# This command downloads the browser binaries into the image
RUN playwright install --with-deps
# Copy the rest of the application code into the container at /app
COPY . .
# Expose the port the app runs on
EXPOSE 8001
# Define the command to run the application
# Use 0.0.0.0 to make it accessible from outside the container
CMD ["uvicorn", "gmaps_scraper_server.main_api:app", "--host", "0.0.0.0", "--port", "8001"]
MIT License
Copyright (c) 2025 conor-is-my-name
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Google Maps Scraper API
A FastAPI service for scraping Google Maps data based on search queries. Ideal for n8n users.
- Very high performance; watch out for rate limiting.
- Use variables to fill in the URL parameters, e.g. `/scrape-get?query=hotels%20in%2098392&max_places=100&lang=en&headless=true`.
- If using n8n or other automation tools, use the `/scrape-get` endpoint so the results are returned in the response.
- Simple install: copy the files and run `docker compose up -d`.
- Intended to be used with this n8n build: https://github.com/conor-is-my-name/n8n-autoscaling
## API Endpoints
### POST `/scrape`
Main scraping endpoint (recommended for production)
**Parameters:**
- `query` (required): Search query (e.g., "hotels in 98392")
- `max_places` (optional): Maximum number of results to return
- `lang` (optional, default "en"): Language code for results
- `headless` (optional, default true): Run browser in headless mode
### GET `/scrape-get`
Alternative GET endpoint with the same functionality, plus optional `lat`, `lng`, and `max_distance_km` (default 30) parameters for centering the search and filtering results by distance from a reference point.
### GET `/`
Health check endpoint
## Example Requests
### POST Example
```bash
curl -X POST "http://localhost:8001/scrape?query=hotels%20in%2098392&max_places=10&lang=en&headless=true"
```
The `/scrape` parameters are declared as FastAPI query parameters, so they are passed in the URL rather than as a JSON body.
### GET Example
```bash
curl "http://localhost:8001/scrape-get?query=hotels%20in%2098392&max_places=10&lang=en&headless=true"
```
or
```bash
curl "http://gmaps_scraper_api_service:8001/scrape-get?query=hotels%20in%2098392&max_places=10&lang=en&headless=true"
```
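An example using the optional geo parameters (coordinates taken from the included notebook):
```bash
curl "http://localhost:8001/scrape-get?query=campus%20shoe%20store&max_places=10&lang=en&headless=true&lat=28.675908902553893&lng=77.29237331334664&max_distance_km=30"
```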
## Running the Service
### Docker
```bash
docker-compose up --build
```
### Local Development
1. Install dependencies:
```bash
pip install -r requirements.txt
```
2. Run the API:
```bash
uvicorn gmaps_scraper_server.main_api:app --reload --port 8001
```
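Before the first local run, install the Playwright browser binaries as well (the Dockerfile does this with `playwright install --with-deps`):
```bash
playwright install
```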
The API will be available at `http://localhost:8001`, or from other containers on the shared Docker network at `http://gmaps_scraper_api_service:8001`.
## Notes
- For production use, consider adding authentication (see the sketch after these notes)
- The scraping process may take several seconds to minutes depending on the number of results
- Results format depends on the underlying scraper implementation
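
A minimal sketch of API-key authentication, assuming a hypothetical `API_KEY` environment variable and `X-API-Key` header (not part of the current code):
```python
import os
from fastapi import Depends, HTTPException, Security
from fastapi.security import APIKeyHeader

api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)

async def require_api_key(api_key: str = Security(api_key_header)):
    # Compare the presented key against the API_KEY environment variable (assumed name).
    expected = os.environ.get("API_KEY")
    if not expected or api_key != expected:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")

# Example: protect an endpoint by adding the dependency, e.g.
# @app.post("/scrape", dependencies=[Depends(require_api_key)])
```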
version: '3.8'
services:
scraper-api:
build: . # Build the image from the Dockerfile in the current directory
container_name: gmaps_scraper_api_service # Optional: specify a container name
ports:
- "8001:8001" # Map host port 8001 to container port 8001
restart: unless-stopped # Restart policy
volumes:
- .:/app # Mount current directory to /app in container
working_dir: /app # Set working directory to mounted volume
networks:
- shark
# Optional: Add environment variables if needed for configuration
# environment:
# - HEADLESS_MODE=true
cpu_shares: 1024 # Add cpu_shares here if not using Swarm mode
# Create the external network first with:
# docker network create shark
networks:
shark:
external: true
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "dee123a6-7a41-4224-bdfe-abb707987ce5",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import pandas as pd\n",
"from math import radians, sin, cos, sqrt, atan2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4808fc9e-d455-45f4-8480-0647d880664a",
"metadata": {},
"outputs": [],
"source": [
"28.675908902553893, 77.29237331334664"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "ed8dbc21-bd71-409d-99f7-79bef9d9cf7c",
"metadata": {},
"outputs": [],
"source": [
"url = (\n",
" \"http://127.0.0.1:8000/scrape-get\"\n",
" \"?query=campus%20shoe%20store\"\n",
" \"&max_places=10\"\n",
" \"&lang=en\"\n",
" \"&headless=true\"\n",
" \"&lat=28.675908902553893\"\n",
" \"&lng=77.29237331334664\"\n",
" \"&max_distance_km=30\"\n",
")\n",
"response = requests.get(url)\n",
"data = response.json()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "83264d30-c065-4489-8ba1-76ef36d4e589",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('results.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2c862f1b-d00c-4c6a-9b33-453a1d383f70",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['name', 'place_id', 'address', 'rating', 'reviews_count', 'categories',\n",
" 'website', 'phone', 'link', 'latitude', 'longitude'],\n",
" dtype='object')"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f3237992-a408-4f98-bd5c-9d75693866d8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>place_id</th>\n",
" <th>address</th>\n",
" <th>rating</th>\n",
" <th>reviews_count</th>\n",
" <th>categories</th>\n",
" <th>website</th>\n",
" <th>phone</th>\n",
" <th>link</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>distance_km</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be7c906e6130791:0xcc05d9ccdb1a6507</td>\n",
" <td>Phoenix Marcketcity Kurla, Shop No- LG - 09, L...</td>\n",
" <td>4.8</td>\n",
" <td>187</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/mum...</td>\n",
" <td>9289690421</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.086596</td>\n",
" <td>72.888560</td>\n",
" <td>135.963652</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91</td>\n",
" <td>Survey No.207, Pheonix Mall pune, Store No, GP...</td>\n",
" <td>4.5</td>\n",
" <td>76</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289690432</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.562243</td>\n",
" <td>73.916699</td>\n",
" <td>159.702331</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bddeb069cbc110d:0x17263f831fd89986</td>\n",
" <td>+ SFS 10, City Centre Mall, MF 16A, Untwadi Rd...</td>\n",
" <td>4.9</td>\n",
" <td>195</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/nas...</td>\n",
" <td>9289690426</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.990579</td>\n",
" <td>73.762117</td>\n",
" <td>0.062102</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be7bb7fedefaeb5:0x7bd871cb3e0a97c8</td>\n",
" <td>LGF, Mouje, R mall, Manpada, Pot Tukdi and Tal...</td>\n",
" <td>4.9</td>\n",
" <td>161</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/tha...</td>\n",
" <td>9289690431</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.234354</td>\n",
" <td>72.972598</td>\n",
" <td>117.945696</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be7b9c605eea545:0xb793f8af46ce5fac</td>\n",
" <td>Shop No. 1&amp;2, Munshi Estate, Plot No 504, MG R...</td>\n",
" <td>4.7</td>\n",
" <td>100</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/mum...</td>\n",
" <td>9289148572</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.173210</td>\n",
" <td>72.955426</td>\n",
" <td>124.115873</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name place_id \\\n",
"0 Campus Exclusive Store 0x3be7c906e6130791:0xcc05d9ccdb1a6507 \n",
"1 Campus Exclusive Store 0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91 \n",
"2 Campus Exclusive Store 0x3bddeb069cbc110d:0x17263f831fd89986 \n",
"3 Campus Exclusive Store 0x3be7bb7fedefaeb5:0x7bd871cb3e0a97c8 \n",
"4 Campus Exclusive Store 0x3be7b9c605eea545:0xb793f8af46ce5fac \n",
"\n",
" address rating reviews_count \\\n",
"0 Phoenix Marcketcity Kurla, Shop No- LG - 09, L... 4.8 187 \n",
"1 Survey No.207, Pheonix Mall pune, Store No, GP... 4.5 76 \n",
"2 + SFS 10, City Centre Mall, MF 16A, Untwadi Rd... 4.9 195 \n",
"3 LGF, Mouje, R mall, Manpada, Pot Tukdi and Tal... 4.9 161 \n",
"4 Shop No. 1&2, Munshi Estate, Plot No 504, MG R... 4.7 100 \n",
"\n",
" categories \\\n",
"0 ['Shoe store', 'Sportswear store', 'Store'] \n",
"1 ['Shoe store', 'Sportswear store', 'Store'] \n",
"2 ['Shoe store', 'Sportswear store', 'Store'] \n",
"3 ['Shoe store', 'Sportswear store', 'Store'] \n",
"4 ['Shoe store', 'Sportswear store', 'Store'] \n",
"\n",
" website phone \\\n",
"0 https://stores.campusshoes.com/maharashtra/mum... 9289690421 \n",
"1 https://stores.campusshoes.com/maharashtra/pun... 9289690432 \n",
"2 https://stores.campusshoes.com/maharashtra/nas... 9289690426 \n",
"3 https://stores.campusshoes.com/maharashtra/tha... 9289690431 \n",
"4 https://stores.campusshoes.com/maharashtra/mum... 9289148572 \n",
"\n",
" link latitude longitude \\\n",
"0 https://www.google.com/maps/place/Campus+Exclu... 19.086596 72.888560 \n",
"1 https://www.google.com/maps/place/Campus+Exclu... 18.562243 73.916699 \n",
"2 https://www.google.com/maps/place/Campus+Exclu... 19.990579 73.762117 \n",
"3 https://www.google.com/maps/place/Campus+Exclu... 19.234354 72.972598 \n",
"4 https://www.google.com/maps/place/Campus+Exclu... 19.173210 72.955426 \n",
"\n",
" distance_km \n",
"0 135.963652 \n",
"1 159.702331 \n",
"2 0.062102 \n",
"3 117.945696 \n",
"4 124.115873 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "5769b6d2-bb76-4239-9de5-f666096359e0",
"metadata": {},
"outputs": [],
"source": [
"ref_lat = 19.991013551968383\n",
"ref_lng = 73.76174367035061\n",
"max_distance_km = 30"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4c1af779-4486-457d-aa5b-189ec6dd6dea",
"metadata": {},
"outputs": [],
"source": [
"def haversine(lat1, lng1, lat2, lng2):\n",
" R = 6371 # Earth radius in kilometers\n",
" d_lat = radians(lat2 - lat1)\n",
" d_lng = radians(lng2 - lng1)\n",
" a = sin(d_lat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(d_lng / 2) ** 2\n",
" c = 2 * atan2(sqrt(a), sqrt(1 - a))\n",
" return R * c"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d4a74da2-5fb9-4040-820c-fd21ec95de15",
"metadata": {},
"outputs": [],
"source": [
"df[\"distance_km\"] = df.apply(\n",
" lambda row: haversine(ref_lat, ref_lng, row[\"latitude\"], row[\"longitude\"]),\n",
" axis=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "dc4e4efc-b324-40e9-8630-f33b250ffd13",
"metadata": {},
"outputs": [],
"source": [
"within_radius = df[df[\"distance_km\"] <= max_distance_km]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "eb594d2d-1009-4fa6-aa96-80734933c008",
"metadata": {},
"outputs": [],
"source": [
"addresses_within_radius = within_radius[\"address\"].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "df8bee5c-9bae-4eeb-8cc8-677ecc126800",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Addresses within 30 km:\n",
"+ SFS 10, City Centre Mall, MF 16A, Untwadi Rd, Lawate Nagar, Lavate Nager, Parijat Nagar, Nashik, Maharashtra 422002\n"
]
}
],
"source": [
"print(\"Addresses within 30 km:\")\n",
"for address in addresses_within_radius:\n",
" print(address)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11fbf932-e06b-42b7-b92d-25a4f7d65e71",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
import json
import re
import csv
import os
def safe_get(data, *keys):
"""
Safely retrieves nested data from a dictionary or list using a sequence of keys/indices.
Returns None if any key/index is not found or if the data structure is invalid.
"""
current = data
for key in keys:
try:
if isinstance(current, list):
if isinstance(key, int) and 0 <= key < len(current):
current = current[key]
else:
# print(f"Index {key} out of bounds or invalid for list.")
return None
elif isinstance(current, dict):
if key in current:
current = current[key]
else:
# print(f"Key {key} not found in dict.")
return None
else:
# print(f"Cannot access key {key} on non-dict/list item: {type(current)}")
return None
except (IndexError, TypeError, KeyError) as e:
# print(f"Error accessing key {key}: {e}")
return None
return current
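# Illustrative usage (added example, not part of the original file):
#   safe_get({"a": [{"b": 1}]}, "a", 0, "b")  -> 1
#   safe_get({"a": [{"b": 1}]}, "a", 5, "b")  -> None  (index out of range)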
def extract_initial_json(html_content):
"""
Extracts the JSON string assigned to window.APP_INITIALIZATION_STATE from HTML content.
"""
try:
match = re.search(r';window\.APP_INITIALIZATION_STATE\s*=\s*(.*?);window\.APP_FLAGS', html_content, re.DOTALL)
if match:
json_str = match.group(1)
if json_str.strip().startswith(('[', '{')):
return json_str
else:
print("Extracted content doesn't look like valid JSON start.")
return None
else:
print("APP_INITIALIZATION_STATE pattern not found.")
return None
except Exception as e:
print(f"Error extracting JSON string: {e}")
return None
def parse_json_data(json_str):
"""
Parses the extracted JSON string, handling the nested JSON string if present.
Returns the main data blob (list) or None if parsing fails or structure is unexpected.
"""
if not json_str:
return None
try:
initial_data = json.loads(json_str)
# Check the initial heuristic path [3][6]
if isinstance(initial_data, list) and len(initial_data) > 3 and isinstance(initial_data[3], list) and len(initial_data[3]) > 6:
data_blob_or_str = initial_data[3][6]
# Case 1: It's already the list we expect (older format?)
if isinstance(data_blob_or_str, list):
print("Found expected list structure directly at initial_data[3][6].")
return data_blob_or_str
# Case 2: It's the string containing the actual JSON
elif isinstance(data_blob_or_str, str) and data_blob_or_str.startswith(")]}'\n"):
print("Found string at initial_data[3][6], attempting to parse inner JSON.")
try:
json_str_inner = data_blob_or_str.split(")]}'\n", 1)[1]
actual_data = json.loads(json_str_inner)
# Check if the parsed inner data is a list and has the expected sub-structure at index 6
if isinstance(actual_data, list) and len(actual_data) > 6:
potential_data_blob = safe_get(actual_data, 6)
if isinstance(potential_data_blob, list):
print("Returning data blob found at actual_data[6].")
return potential_data_blob # This is the main data structure
else:
print(f"Data at actual_data[6] is not a list, but {type(potential_data_blob)}. Saving inner data for inspection.")
# Save actual_data for debugging
try:
with open("debug_inner_data.json", "w", encoding="utf-8") as f_inner:
json.dump(actual_data, f_inner, indent=2)
print("...Successfully saved debug_inner_data.json")
except Exception as dump_error_inner:
print(f"Error saving inner debug file: {dump_error_inner}")
return None # Structure mismatch within inner data
else:
print(f"Parsed inner JSON is not a list or too short (len <= 6), type: {type(actual_data)}. Saving inner data for inspection.")
# Save actual_data for debugging
try:
with open("debug_inner_data.json", "w", encoding="utf-8") as f_inner:
json.dump(actual_data, f_inner, indent=2)
print("...Successfully saved debug_inner_data.json")
except Exception as dump_error_inner:
print(f"Error saving inner debug file: {dump_error_inner}")
return None # Inner JSON structure not as expected
except json.JSONDecodeError as e_inner:
print(f"Error decoding inner JSON string: {e_inner}")
return None
except Exception as e_inner_general:
print(f"Unexpected error processing inner JSON string: {e_inner_general}")
return None
# Case 3: Data at [3][6] is neither a list nor the expected string
else:
print(f"Parsed JSON structure unexpected at [3][6]. Expected list or prefixed JSON string, got {type(data_blob_or_str)}.")
# Save initial_data for debugging
print("Attempting to save full structure to debug_initial_data.json...")
try:
with open("debug_initial_data.json", "w", encoding="utf-8") as f:
json.dump(initial_data, f, indent=2)
print("...Successfully saved debug_initial_data.json")
except Exception as dump_error:
print(f"Error saving debug file: {dump_error}")
return None # Unexpected structure at [3][6]
# Case 4: Initial path [3][6] itself wasn't valid
else:
print(f"Initial JSON structure not as expected (list[3][6] path not valid). Type: {type(initial_data)}")
# Save initial_data for debugging
print("Attempting to save unexpected structure to debug_initial_data.json...")
try:
with open("debug_initial_data.json", "w", encoding="utf-8") as f:
json.dump(initial_data, f, indent=2)
print("...Successfully saved debug_initial_data.json")
except Exception as dump_error:
print(f"Error saving debug file: {dump_error}")
return None # Initial structure invalid
except json.JSONDecodeError as e:
print(f"Error decoding initial JSON: {e}")
return None
except Exception as e:
print(f"Unexpected error parsing JSON data: {e}")
return None
# --- Field Extraction Functions (Indices relative to the data_blob returned by parse_json_data) ---
def get_main_name(data):
"""Extracts the main name of the place."""
# Index relative to the data_blob returned by parse_json_data
# Confirmed via debug_inner_data.json: data_blob = actual_data[6], name = data_blob[11]
return safe_get(data, 11)
def get_place_id(data):
"""Extracts the Google Place ID."""
return safe_get(data, 10) # Updated index
def get_gps_coordinates(data):
"""Extracts latitude and longitude."""
lat = safe_get(data, 9, 2)
lon = safe_get(data, 9, 3)
if lat is not None and lon is not None:
return {"latitude": lat, "longitude": lon}
return None
def get_complete_address(data):
"""Extracts structured address components and joins them."""
address_parts = safe_get(data, 2) # Updated index
if isinstance(address_parts, list):
formatted = ", ".join(filter(None, address_parts))
return formatted if formatted else None
return None
def get_rating(data):
"""Extracts the average star rating."""
return safe_get(data, 4, 7)
def get_reviews_count(data):
"""Extracts the total number of reviews."""
return safe_get(data, 4, 8)
def get_website(data):
"""Extracts the primary website link."""
# Index based on debug_inner_data.json structure relative to data_blob (actual_data[6])
return safe_get(data, 7, 0)
def _find_phone_recursively(data_structure):
"""
Recursively searches a nested list/dict structure for a list containing
the phone icon URL followed by the phone number string.
"""
if isinstance(data_structure, list):
# Check if this list matches the pattern [icon_url, phone_string, ...]
if len(data_structure) >= 2 and \
isinstance(data_structure[0], str) and "call_googblue" in data_structure[0] and \
isinstance(data_structure[1], str):
# Found the pattern, assume data_structure[1] is the phone number
phone_number_str = data_structure[1]
standardized_number = re.sub(r'\D', '', phone_number_str)
if standardized_number:
# print(f"Debug: Found phone via recursive search: {standardized_number}")
return standardized_number
# If not the target list, recurse into list elements
for item in data_structure:
found_phone = _find_phone_recursively(item)
if found_phone:
return found_phone
elif isinstance(data_structure, dict):
# Recurse into dictionary values
for key, value in data_structure.items():
found_phone = _find_phone_recursively(value)
if found_phone:
return found_phone
# Base case: not a list/dict or pattern not found in this branch
return None
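# Illustrative (assumed input shape): a nested list such as
#   ["...call_googblue...", "+91 92896 90421", ...]
# yields "919289690421" after non-digit characters are stripped.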
def get_phone_number(data_blob):
"""
Extracts and standardizes the primary phone number by recursively searching
the data_blob for the phone icon pattern.
"""
# data_blob is the main list structure (e.g., actual_data[6])
found_phone = _find_phone_recursively(data_blob)
if found_phone:
return found_phone
else:
# print("Debug: Phone number pattern not found in data_blob.")
return None
def get_categories(data):
"""Extracts the list of categories/types."""
return safe_get(data, 13)
def get_thumbnail(data):
"""Extracts the main thumbnail image URL."""
# This path might still be relative to the old structure, needs verification
# If data_blob is the list starting at actual_data[6], this index is likely wrong.
# We need to find the thumbnail within the new structure from debug_inner_data.json
# For now, returning None until verified.
# return safe_get(data, 72, 0, 1, 6, 0) # Placeholder index - LIKELY WRONG
# Tentative guess based on debug_inner_data structure (might be in a sublist like [14][0][0][6][0]?)
return safe_get(data, 14, 0, 0, 6, 0) # Tentative guess
# Add more extraction functions here as needed, using the indices
# from omkarcloud/src/extract_data.py as a reference, BUT VERIFYING against debug_inner_data.json
def extract_place_data(html_content):
"""
High-level function to orchestrate extraction from HTML content.
Saves extracted data into 'extracted_places.csv'.
"""
json_str = extract_initial_json(html_content)
if not json_str:
print("Failed to extract JSON string from HTML.")
return None
data_blob = parse_json_data(json_str)
if not data_blob:
print("Failed to parse JSON data or find expected structure.")
return None
print("Parsed data_blob type:", type(data_blob))
if isinstance(data_blob, list):
print("data_blob length:", len(data_blob))
with open("debug_data_blob_per_place.json", "w", encoding="utf-8") as f:
json.dump(data_blob, f, indent=2)
with open("debug_full_place_page.json", "w", encoding="utf-8") as f:
json.dump(data_blob, f, indent=2)
# Extract individual fields
place_details = {
"name": get_main_name(data_blob),
"place_id": get_place_id(data_blob),
"coordinates": get_gps_coordinates(data_blob),
"address": get_complete_address(data_blob),
"rating": get_rating(data_blob),
"reviews_count": get_reviews_count(data_blob),
"categories": get_categories(data_blob),
"website": get_website(data_blob),
"phone": get_phone_number(data_blob),
"thumbnail": get_thumbnail(data_blob),
}
# Flatten coordinates into latitude and longitude for CSV
coords = place_details.pop("coordinates", None)
if coords:
place_details["latitude"] = coords.get("latitude")
place_details["longitude"] = coords.get("longitude")
# Filter out None values
place_details = {k: v for k, v in place_details.items() if v is not None}
# Save to CSV if valid
if place_details:
save_to_csv(place_details, "extracted_places.csv")
return place_details
else:
print("No valid fields found to save.")
return None
def save_to_csv(data: dict, filename: str):
"""Appends a single row dictionary to a CSV file, creating it if it doesn't exist."""
file_exists = os.path.isfile(filename)
with open(filename, mode='a', encoding='utf-8', newline='') as f:
writer = csv.DictWriter(f, fieldnames=data.keys())
if not file_exists:
writer.writeheader()
writer.writerow(data)
print(f"Saved extracted data to {filename}")
# Example usage (for testing):
if __name__ == '__main__':
# Load sample HTML content from a file (replace 'sample_place.html' with your file)
try:
with open('sample_place.html', 'r', encoding='utf-8') as f:
sample_html = f.read()
extracted_info = extract_place_data(sample_html)
if extracted_info:
print("Extracted Place Data:")
print(json.dumps(extracted_info, indent=2))
else:
print("Could not extract data from the sample HTML.")
except FileNotFoundError:
print("Sample HTML file 'sample_place.html' not found. Cannot run example.")
except Exception as e:
print(f"An error occurred during example execution: {e}")
from fastapi import FastAPI, HTTPException, Query
from typing import Optional, List, Dict, Any
import logging
# Import the scraper function (adjust path if necessary)
try:
from gmaps_scraper_server.scraper import scrape_google_maps
except ImportError:
# Handle case where scraper might be in a different structure later
logging.error("Could not import scrape_google_maps from scraper.py")
# Define a dummy function to allow API to start, but fail on call
def scrape_google_maps(*args, **kwargs):
raise ImportError("Scraper function not available.")
# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
app = FastAPI(
title="Google Maps Scraper API",
description="API to trigger Google Maps scraping based on a query.",
version="0.1.0",
)
@app.post("/scrape", response_model=List[Dict[str, Any]])
async def run_scrape(
query: str = Query(..., description="The search query for Google Maps (e.g., 'restaurants in New York')"),
max_places: Optional[int] = Query(None, description="Maximum number of places to scrape. Scrapes all found if None."),
lang: str = Query("en", description="Language code for Google Maps results (e.g., 'en', 'es')."),
    headless: bool = Query(True, description="Run the browser in headless mode (no UI). Set to false for debugging locally."),
    lat: Optional[float] = Query(None, description="Optional latitude used to center the search and filter results."),
    lng: Optional[float] = Query(None, description="Optional longitude used to center the search and filter results.")
):
"""
Triggers the Google Maps scraping process for the given query.
"""
logging.info(f"Received scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}")
try:
results = await scrape_google_maps(
query=query,
max_places=max_places,
lang=lang,
headless=headless,
lat=lat,
lng=lng
)
logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.")
return results
except ImportError as e:
logging.error(f"ImportError during scraping for query '{query}': {e}")
raise HTTPException(status_code=500, detail="Server configuration error: Scraper not available.")
except Exception as e:
logging.error(f"An error occurred during scraping for query '{query}': {e}", exc_info=True)
# Consider more specific error handling based on scraper exceptions
raise HTTPException(status_code=500, detail=f"An internal error occurred during scraping: {str(e)}")
@app.get("/scrape-get", response_model=List[Dict[str, Any]])
async def run_scrape_get(
query: str = Query(...),
max_places: Optional[int] = Query(None),
lang: str = Query("en"),
headless: bool = Query(True),
lat: Optional[float] = Query(None),
lng: Optional[float] = Query(None),
max_distance_km: float = Query(30.0, description="Maximum distance in kilometers from (lat, lng)")
):
try:
results = await scrape_google_maps(
query=query,
max_places=max_places,
lang=lang,
headless=headless,
lat=lat,
lng=lng,
max_distance_km=max_distance_km
)
return results
except Exception as e:
logging.error(f"Error: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal Server Error")
@app.get("/")
async def read_root():
return {"message": "Google Maps Scraper API is running."}
import json
import asyncio # Changed from time
import re
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError # Changed to async
from urllib.parse import urlencode
from math import radians, sin, cos, sqrt, atan2
# Import the extraction functions from our helper module
from . import extractor
# --- Constants ---
BASE_URL = "https://www.google.com/maps/search/"
DEFAULT_TIMEOUT = 30000 # 30 seconds for navigation and selectors
SCROLL_PAUSE_TIME = 1.5 # Pause between scrolls
MAX_SCROLL_ATTEMPTS_WITHOUT_NEW_LINKS = 5 # Stop scrolling if no new links found after this many scrolls
## Calculate distance
def haversine(lat1, lon1, lat2, lon2):
R = 6371 # Radius of the Earth in kilometers
dlat = radians(lat2 - lat1)
dlon = radians(lon2 - lon1)
a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
return R * c
# --- Helper Functions ---
def create_search_url(query, lang="en", geo_coordinates=None, zoom=12):
"""
Creates a Google Maps search URL centered around specific geo coordinates.
Args:
query (str): Search query (e.g., "campus shoe store").
lang (str): Language code (default: "en").
geo_coordinates (tuple): (latitude, longitude) to center the map.
zoom (int): Zoom level (default: 12).
Returns:
str: Full Google Maps search URL.
"""
params = {'hl': lang}
encoded_query = query.replace(' ', '+')
if geo_coordinates:
lat, lng = geo_coordinates
return f"{BASE_URL}{encoded_query}/@{lat},{lng},{zoom}z?{urlencode(params)}"
return f"{BASE_URL}{encoded_query}?{urlencode(params)}"
# --- Main Scraping Logic ---
async def scrape_google_maps(query, max_places=None, lang="en", headless=True, lat=None, lng=None, max_distance_km=30):
results = []
place_links = set()
scroll_attempts_no_new = 0
    geo = (lat, lng) if lat is not None and lng is not None else None
    search_url = create_search_url(query, lang, geo_coordinates=geo)
async with async_playwright() as p: # Changed to async
try:
browser = await p.chromium.launch(headless=headless) # Added await
context = await browser.new_context( # Added await
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
java_script_enabled=True,
accept_downloads=False,
# Consider setting viewport, locale, timezone if needed
locale=lang,
)
page = await context.new_page() # Added await
if not page:
await browser.close() # Close browser before raising
                raise Exception("Failed to create a new browser page (context.new_page() returned None).")
# Removed problematic: await page.set_default_timeout(DEFAULT_TIMEOUT)
# Removed associated debug prints
            search_url = create_search_url(query, lang, geo_coordinates=geo)
print(f"Navigating to search URL: {search_url}")
await page.goto(search_url, wait_until='domcontentloaded') # Added await
await asyncio.sleep(2) # Changed to asyncio.sleep, added await
# --- Handle potential consent forms ---
# This is a common pattern, might need adjustment based on specific consent popups
try:
consent_button_xpath = "//button[.//span[contains(text(), 'Accept all') or contains(text(), 'Reject all')]]"
# Wait briefly for the button to potentially appear
await page.wait_for_selector(consent_button_xpath, state='visible', timeout=5000) # Added await
# Click the "Accept all" or equivalent button if found
# Example: Prioritize "Accept all"
accept_button = await page.query_selector("//button[.//span[contains(text(), 'Accept all')]]") # Added await
if accept_button:
print("Accepting consent form...")
await accept_button.click() # Added await
else:
# Fallback to clicking the first consent button found (might be reject)
print("Clicking first available consent button...")
await page.locator(consent_button_xpath).first.click() # Added await
# Wait for navigation/popup closure
await page.wait_for_load_state('networkidle', timeout=5000) # Added await
except PlaywrightTimeoutError:
print("No consent form detected or timed out waiting.")
except Exception as e:
print(f"Error handling consent form: {e}")
# --- Scrolling and Link Extraction ---
print("Scrolling to load places...")
feed_selector = '[role="feed"]'
try:
await page.wait_for_selector(feed_selector, state='visible', timeout=25000) # Added await
except PlaywrightTimeoutError:
# Check if it's a single result page (maps/place/)
if "/maps/place/" in page.url:
print("Detected single place page.")
place_links.add(page.url)
else:
print(f"Error: Feed element '{feed_selector}' not found. Maybe no results? Taking screenshot.")
await page.screenshot(path='feed_not_found_screenshot.png') # Added await
await browser.close() # Added await
                    return []  # No results or page structure changed
if await page.locator(feed_selector).count() > 0: # Added await
last_height = await page.evaluate(f'document.querySelector(\'{feed_selector}\').scrollHeight') # Added await
while True:
# Scroll down
await page.evaluate(f'document.querySelector(\'{feed_selector}\').scrollTop = document.querySelector(\'{feed_selector}\').scrollHeight') # Added await
await asyncio.sleep(SCROLL_PAUSE_TIME) # Changed to asyncio.sleep, added await
# Extract links after scroll
current_links_list = await page.locator(f'{feed_selector} a[href*="/maps/place/"]').evaluate_all('elements => elements.map(a => a.href)') # Added await
current_links = set(current_links_list)
new_links_found = len(current_links - place_links) > 0
place_links.update(current_links)
print(f"Found {len(place_links)} unique place links so far...")
if max_places is not None and len(place_links) >= max_places:
print(f"Reached max_places limit ({max_places}).")
place_links = set(list(place_links)[:max_places]) # Trim excess links
break
# Check if scroll height has changed
new_height = await page.evaluate(f'document.querySelector(\'{feed_selector}\').scrollHeight') # Added await
if new_height == last_height:
# Check for the "end of results" marker
end_marker_xpath = "//span[contains(text(), \"You've reached the end of the list.\")]"
if await page.locator(end_marker_xpath).count() > 0: # Added await
print("Reached the end of the results list.")
break
else:
# If height didn't change but end marker isn't there, maybe loading issue?
# Increment no-new-links counter
if not new_links_found:
scroll_attempts_no_new += 1
print(f"Scroll height unchanged and no new links. Attempt {scroll_attempts_no_new}/{MAX_SCROLL_ATTEMPTS_WITHOUT_NEW_LINKS}")
if scroll_attempts_no_new >= MAX_SCROLL_ATTEMPTS_WITHOUT_NEW_LINKS:
print("Stopping scroll due to lack of new links.")
break
else:
scroll_attempts_no_new = 0 # Reset if new links were found this cycle
else:
last_height = new_height
scroll_attempts_no_new = 0 # Reset if scroll height changed
# Optional: Add a hard limit on scrolls to prevent infinite loops
# if scroll_count > MAX_SCROLLS: break
# --- Scraping Individual Places ---
print(f"\nScraping details for {len(place_links)} places...")
count = 0
for link in place_links:
count += 1
print(f"Processing link {count}/{len(place_links)}: {link}") # Keep sync print
try:
await page.goto(link, wait_until='domcontentloaded') # Added await
# Wait a bit for dynamic content if needed, or wait for a specific element
# await page.wait_for_load_state('networkidle', timeout=10000) # Or networkidle if needed
html_content = await page.content() # Added await
place_data = extractor.extract_place_data(html_content)
if place_data:
place_data['link'] = link # Add the source link
results.append(place_data)
# print(json.dumps(place_data, indent=2)) # Optional: print data as it's scraped
else:
print(f" - Failed to extract data for: {link}")
# Optionally save the HTML for debugging
# with open(f"error_page_{count}.html", "w", encoding="utf-8") as f:
# f.write(html_content)
except PlaywrightTimeoutError:
print(f" - Timeout navigating to or processing: {link}")
except Exception as e:
print(f" - Error processing {link}: {e}")
await asyncio.sleep(0.5) # Changed to asyncio.sleep, added await
await browser.close() # Added await
if lat is not None and lng is not None:
results = [
place for place in results
if 'latitude' in place and 'longitude' in place and
haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
]
print(f"\nScraping finished. Found details for {len(results)} places.")
return results
except PlaywrightTimeoutError:
print(f"Timeout error during scraping process.")
except Exception as e:
print(f"An error occurred during scraping: {e}")
import traceback
traceback.print_exc() # Print detailed traceback for debugging
finally:
# Ensure browser is closed if an error occurred mid-process
if 'browser' in locals() and browser.is_connected(): # Check if browser exists and is connected
await browser.close() # Added await
if lat is not None and lng is not None:
results = [
place for place in results
if 'latitude' in place and 'longitude' in place and
haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
]
print(f"\nScraping finished. Found details for {len(results)} places.")
return results
# --- Example Usage ---
# (Example usage block removed as this script is now intended to be imported as a module)
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dee123a6-7a41-4224-bdfe-abb707987ce5",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import pandas as pd\n",
"from math import radians, sin, cos, sqrt, atan2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4808fc9e-d455-45f4-8480-0647d880664a",
"metadata": {},
"outputs": [],
"source": [
"28.61031036251309, 77.33507542317945"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ed8dbc21-bd71-409d-99f7-79bef9d9cf7c",
"metadata": {},
"outputs": [],
"source": [
"url = (\n",
" \"http://127.0.0.1:8000/scrape-get\"\n",
" \"?query=campus%20shoe%20store%20near%20me\"\n",
" \"&max_places=10\"\n",
" \"&lang=en\"\n",
" \"&headless=true\"\n",
" \"&lat=28.61031036251309\"\n",
" \"&lng=77.33507542317945\"\n",
" \"&max_distance_km=30\"\n",
")\n",
"response = requests.get(url)\n",
"data = response.json()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "83264d30-c065-4489-8ba1-76ef36d4e589",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('extracted_places.csv')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "2c862f1b-d00c-4c6a-9b33-453a1d383f70",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['name', 'place_id', 'address', 'rating', 'reviews_count', 'categories',\n",
" 'website', 'phone', 'latitude', 'longitude'],\n",
" dtype='object')"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "f3237992-a408-4f98-bd5c-9d75693866d8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>place_id</th>\n",
" <th>address</th>\n",
" <th>rating</th>\n",
" <th>reviews_count</th>\n",
" <th>categories</th>\n",
" <th>website</th>\n",
" <th>phone</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2bfdaf0058f05:0x7fc4225eb0eed720</td>\n",
" <td>Unit No- HGF 23, Higher Gr Floor, Parihar chow...</td>\n",
" <td>4.5</td>\n",
" <td>69</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289148578</td>\n",
" <td>18.561325</td>\n",
" <td>73.806900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be7c906e6130791:0xcc05d9ccdb1a6507</td>\n",
" <td>Phoenix Marcketcity Kurla, Shop No- LG - 09, L...</td>\n",
" <td>4.8</td>\n",
" <td>187</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/mum...</td>\n",
" <td>9289690421</td>\n",
" <td>19.086596</td>\n",
" <td>72.888560</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2bb5812ba0af5:0xaec0dd35c89bc775</td>\n",
" <td>UNIT No-1, FLOOR, 02, GRANT STREET, Phase 1, H...</td>\n",
" <td>4.8</td>\n",
" <td>254</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289690420</td>\n",
" <td>18.594074</td>\n",
" <td>73.725319</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x390d19cf0e5f8bbb:0xcb29be02985eaa66</td>\n",
" <td>Shop no 15 &amp; 16, near Smile Hotel, near Americ...</td>\n",
" <td>4.9</td>\n",
" <td>165</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/aur...</td>\n",
" <td>9545727188</td>\n",
" <td>19.879879</td>\n",
" <td>75.324109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bdd95ba0b6166d9:0x16b564707449cf2a</td>\n",
" <td>Shop No.9, Gurnani Status, near Indian Oil Pet...</td>\n",
" <td>4.8</td>\n",
" <td>63</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/nas...</td>\n",
" <td>9289925775</td>\n",
" <td>19.903676</td>\n",
" <td>73.831575</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name place_id \\\n",
"0 Campus Exclusive Store 0x3bc2bfdaf0058f05:0x7fc4225eb0eed720 \n",
"1 Campus Exclusive Store 0x3be7c906e6130791:0xcc05d9ccdb1a6507 \n",
"2 Campus Exclusive Store 0x3bc2bb5812ba0af5:0xaec0dd35c89bc775 \n",
"3 Campus Exclusive Store 0x390d19cf0e5f8bbb:0xcb29be02985eaa66 \n",
"4 Campus Exclusive Store 0x3bdd95ba0b6166d9:0x16b564707449cf2a \n",
"\n",
" address rating reviews_count \\\n",
"0 Unit No- HGF 23, Higher Gr Floor, Parihar chow... 4.5 69 \n",
"1 Phoenix Marcketcity Kurla, Shop No- LG - 09, L... 4.8 187 \n",
"2 UNIT No-1, FLOOR, 02, GRANT STREET, Phase 1, H... 4.8 254 \n",
"3 Shop no 15 & 16, near Smile Hotel, near Americ... 4.9 165 \n",
"4 Shop No.9, Gurnani Status, near Indian Oil Pet... 4.8 63 \n",
"\n",
" categories \\\n",
"0 ['Shoe store', 'Sportswear store', 'Store'] \n",
"1 ['Shoe store', 'Sportswear store', 'Store'] \n",
"2 ['Shoe store', 'Sportswear store', 'Store'] \n",
"3 ['Shoe store', 'Sportswear store', 'Store'] \n",
"4 ['Shoe store', 'Sportswear store', 'Store'] \n",
"\n",
" website phone latitude \\\n",
"0 https://stores.campusshoes.com/maharashtra/pun... 9289148578 18.561325 \n",
"1 https://stores.campusshoes.com/maharashtra/mum... 9289690421 19.086596 \n",
"2 https://stores.campusshoes.com/maharashtra/pun... 9289690420 18.594074 \n",
"3 https://stores.campusshoes.com/maharashtra/aur... 9545727188 19.879879 \n",
"4 https://stores.campusshoes.com/maharashtra/nas... 9289925775 19.903676 \n",
"\n",
" longitude \n",
"0 73.806900 \n",
"1 72.888560 \n",
"2 73.725319 \n",
"3 75.324109 \n",
"4 73.831575 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2555401-fce3-4f38-b729-22434f32f3fd",
"metadata": {},
"outputs": [],
"source": [
"28.675908902553893, 77.29237331334664"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "5769b6d2-bb76-4239-9de5-f666096359e0",
"metadata": {},
"outputs": [],
"source": [
"ref_lat = 28.675908902553893\n",
"ref_lng = 77.29237331334664\n",
"max_distance_km = 30"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "4c1af779-4486-457d-aa5b-189ec6dd6dea",
"metadata": {},
"outputs": [],
"source": [
"def haversine(lat1, lng1, lat2, lng2):\n",
" R = 6371 # Earth radius in kilometers\n",
" d_lat = radians(lat2 - lat1)\n",
" d_lng = radians(lng2 - lng1)\n",
" a = sin(d_lat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(d_lng / 2) ** 2\n",
" c = 2 * atan2(sqrt(a), sqrt(1 - a))\n",
" return R * c"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "d4a74da2-5fb9-4040-820c-fd21ec95de15",
"metadata": {},
"outputs": [],
"source": [
"df[\"distance_km\"] = df.apply(\n",
" lambda row: haversine(ref_lat, ref_lng, row[\"latitude\"], row[\"longitude\"]),\n",
" axis=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "dc4e4efc-b324-40e9-8630-f33b250ffd13",
"metadata": {},
"outputs": [],
"source": [
"within_radius = df[df[\"distance_km\"] <= max_distance_km]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "eb594d2d-1009-4fa6-aa96-80734933c008",
"metadata": {},
"outputs": [],
"source": [
"addresses_within_radius = within_radius[\"address\"].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "df8bee5c-9bae-4eeb-8cc8-677ecc126800",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Addresses within 30 km:\n"
]
}
],
"source": [
"print(\"Addresses within 30 km:\")\n",
"for address in addresses_within_radius:\n",
" print(address)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11fbf932-e06b-42b7-b92d-25a4f7d65e71",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
import json import json
import re import re
import csv
import os
def safe_get(data, *keys): def safe_get(data, *keys):
""" """
...@@ -250,6 +252,7 @@ def get_thumbnail(data): ...@@ -250,6 +252,7 @@ def get_thumbnail(data):
def extract_place_data(html_content): def extract_place_data(html_content):
""" """
High-level function to orchestrate extraction from HTML content. High-level function to orchestrate extraction from HTML content.
Saves extracted data into 'extracted_places.csv'.
""" """
json_str = extract_initial_json(html_content) json_str = extract_initial_json(html_content)
if not json_str: if not json_str:
...@@ -261,7 +264,16 @@ def extract_place_data(html_content): ...@@ -261,7 +264,16 @@ def extract_place_data(html_content):
print("Failed to parse JSON data or find expected structure.") print("Failed to parse JSON data or find expected structure.")
return None return None
# Now extract individual fields using the helper functions print("Parsed data_blob type:", type(data_blob))
if isinstance(data_blob, list):
print("data_blob length:", len(data_blob))
with open("debug_data_blob_per_place.json", "w", encoding="utf-8") as f:
json.dump(data_blob, f, indent=2)
with open("debug_full_place_page.json", "w", encoding="utf-8") as f:
json.dump(data_blob, f, indent=2)
# Extract individual fields
place_details = { place_details = {
"name": get_main_name(data_blob), "name": get_main_name(data_blob),
"place_id": get_place_id(data_blob), "place_id": get_place_id(data_blob),
...@@ -271,15 +283,40 @@ def extract_place_data(html_content): ...@@ -271,15 +283,40 @@ def extract_place_data(html_content):
"reviews_count": get_reviews_count(data_blob), "reviews_count": get_reviews_count(data_blob),
"categories": get_categories(data_blob), "categories": get_categories(data_blob),
"website": get_website(data_blob), "website": get_website(data_blob),
"phone": get_phone_number(data_blob), # Needs index verification "phone": get_phone_number(data_blob),
"thumbnail": get_thumbnail(data_blob), # Needs index verification "thumbnail": get_thumbnail(data_blob),
# Add other fields as needed
} }
# Filter out None values if desired # Flatten coordinates into latitude and longitude for CSV
coords = place_details.pop("coordinates", None)
if coords:
place_details["latitude"] = coords.get("latitude")
place_details["longitude"] = coords.get("longitude")
# Filter out None values
place_details = {k: v for k, v in place_details.items() if v is not None} place_details = {k: v for k, v in place_details.items() if v is not None}
return place_details if place_details else None # Save to CSV if valid
if place_details:
save_to_csv(place_details, "extracted_places.csv")
return place_details
else:
print("No valid fields found to save.")
return None
def save_to_csv(data: dict, filename: str):
"""Appends a single row dictionary to a CSV file, creating it if it doesn't exist."""
file_exists = os.path.isfile(filename)
with open(filename, mode='a', encoding='utf-8', newline='') as f:
writer = csv.DictWriter(f, fieldnames=data.keys())
if not file_exists:
writer.writeheader()
writer.writerow(data)
print(f"Saved extracted data to {filename}")
# Example usage (for testing): # Example usage (for testing):
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -33,14 +33,13 @@ async def run_scrape( ...@@ -33,14 +33,13 @@ async def run_scrape(
""" """
logging.info(f"Received scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}") logging.info(f"Received scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}")
try: try:
# Run the potentially long-running scraping task results = await scrape_google_maps(
# Note: For production, consider running this in a background task queue (e.g., Celery)
# to avoid blocking the API server for long durations.
results = await scrape_google_maps( # Added await
query=query, query=query,
max_places=max_places, max_places=max_places,
lang=lang, lang=lang,
headless=headless # Pass headless option from API headless=headless,
lat=lat,
lng=lng
) )
logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.") logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.")
return results return results
...@@ -54,42 +53,29 @@ async def run_scrape( ...@@ -54,42 +53,29 @@ async def run_scrape(
@app.get("/scrape-get", response_model=List[Dict[str, Any]]) @app.get("/scrape-get", response_model=List[Dict[str, Any]])
async def run_scrape_get( async def run_scrape_get(
query: str = Query(..., description="The search query for Google Maps (e.g., 'restaurants in New York')"), query: str = Query(...),
max_places: Optional[int] = Query(None, description="Maximum number of places to scrape. Scrapes all found if None."), max_places: Optional[int] = Query(None),
lang: str = Query("en", description="Language code for Google Maps results (e.g., 'en', 'es')."), lang: str = Query("en"),
headless: bool = Query(True, description="Run the browser in headless mode (no UI). Set to false for debugging locally.") headless: bool = Query(True),
lat: Optional[float] = Query(None),
lng: Optional[float] = Query(None),
max_distance_km: float = Query(30.0, description="Maximum distance in kilometers from (lat, lng)")
): ):
"""
Triggers the Google Maps scraping process for the given query via GET request.
"""
logging.info(f"Received GET scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}")
try: try:
# Run the potentially long-running scraping task results = await scrape_google_maps(
# Note: For production, consider running this in a background task queue (e.g., Celery)
# to avoid blocking the API server for long durations.
results = await scrape_google_maps( # Added await
query=query, query=query,
max_places=max_places, max_places=max_places,
lang=lang, lang=lang,
headless=headless # Pass headless option from API headless=headless,
lat=lat,
lng=lng,
max_distance_km=max_distance_km
) )
logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.")
return results return results
except ImportError as e:
logging.error(f"ImportError during scraping for query '{query}': {e}")
raise HTTPException(status_code=500, detail="Server configuration error: Scraper not available.")
except Exception as e: except Exception as e:
logging.error(f"An error occurred during scraping for query '{query}': {e}", exc_info=True) logging.error(f"Error: {e}", exc_info=True)
# Consider more specific error handling based on scraper exceptions raise HTTPException(status_code=500, detail="Internal Server Error")
raise HTTPException(status_code=500, detail=f"An internal error occurred during scraping: {str(e)}")
# Basic root endpoint for health check or info
@app.get("/") @app.get("/")
async def read_root(): async def read_root():
return {"message": "Google Maps Scraper API is running."} return {"message": "Google Maps Scraper API is running."}
# Example for running locally (uvicorn main_api:app --reload)
# if __name__ == "__main__":
# import uvicorn
# uvicorn.run(app, host="0.0.0.0", port=8001)
\ No newline at end of file
...@@ -3,6 +3,7 @@ import asyncio # Changed from time ...@@ -3,6 +3,7 @@ import asyncio # Changed from time
import re import re
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError # Changed to async from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError # Changed to async
from urllib.parse import urlencode from urllib.parse import urlencode
from math import radians, sin, cos, sqrt, atan2
# Import the extraction functions from our helper module # Import the extraction functions from our helper module
from . import extractor from . import extractor
@@ -13,32 +14,46 @@
DEFAULT_TIMEOUT = 30000  # 30 seconds for navigation and selectors
SCROLL_PAUSE_TIME = 1.5  # Pause between scrolls
MAX_SCROLL_ATTEMPTS_WITHOUT_NEW_LINKS = 5  # Stop scrolling if no new links found after this many scrolls

## Calculate distance
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c
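# (Illustrative sanity check, not part of the original module: the first point is the
# Nashik centre used in the sample notebooks below, the second a Pune store from the
# sample output; the result is on the order of 160 km, well outside a 30 km cut-off.)
# print(haversine(19.99113822646553, 73.76191319096492, 18.562243, 73.916699))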
# --- Helper Functions ---
def create_search_url(query, lang="en", geo_coordinates=None, zoom=12):
    """
    Creates a Google Maps search URL centered around specific geo coordinates.

    Args:
        query (str): Search query (e.g., "campus shoe store").
        lang (str): Language code (default: "en").
        geo_coordinates (tuple): (latitude, longitude) to center the map.
        zoom (int): Zoom level (default: 12).

    Returns:
        str: Full Google Maps search URL.
    """
    params = {'hl': lang}
    encoded_query = query.replace(' ', '+')
    if geo_coordinates:
        lat, lng = geo_coordinates
        return f"{BASE_URL}{encoded_query}/@{lat},{lng},{zoom}z?{urlencode(params)}"
    return f"{BASE_URL}{encoded_query}?{urlencode(params)}"
# --- Main Scraping Logic ---
async def scrape_google_maps(query, max_places=None, lang="en", headless=True, lat=None, lng=None, max_distance_km=30):
    """
    Scrapes Google Maps for places based on a query. If lat and lng are given,
    results are filtered to within max_distance_km kilometres of that point.
    Returns a list of dictionaries, one per scraped place.
    """
    results = []
    place_links = set()
    scroll_attempts_no_new = 0

    async with async_playwright() as p:
        try:
@@ -50,14 +65,24 @@
                # Consider setting viewport, locale, timezone if needed
                locale=lang,
            )
            page = await context.new_page()  # Added await
            if not page:
                await browser.close()  # Close browser before raising
                raise Exception("Failed to create a new browser page (context.new_page() returned None).")

            # Removed problematic: await page.set_default_timeout(DEFAULT_TIMEOUT)
            # Removed associated debug prints
            search_url = create_search_url(query, lang, geo_coordinates=(lat, lng) if lat is not None and lng is not None else None)
            print(f"Navigating to search URL: {search_url}")
            await page.goto(search_url, wait_until='domcontentloaded')  # Added await
            await asyncio.sleep(2)  # Changed to asyncio.sleep, added await
@@ -100,6 +125,16 @@
                print(f"Error: Feed element '{feed_selector}' not found. Maybe no results? Taking screenshot.")
                await page.screenshot(path='feed_not_found_screenshot.png')  # Added await
                await browser.close()  # Added await
                return []  # No results or page structure changed

            if await page.locator(feed_selector).count() > 0:  # Added await
@@ -178,6 +213,15 @@
            await asyncio.sleep(0.5)  # Changed to asyncio.sleep, added await
            await browser.close()  # Added await

            if lat is not None and lng is not None:
                results = [
                    place for place in results
                    if 'latitude' in place and 'longitude' in place and
                    haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
                ]

            print(f"\nScraping finished. Found details for {len(results)} places.")
            return results

        except PlaywrightTimeoutError:
            print(f"Timeout error during scraping process.")
@@ -189,9 +233,15 @@
            # Ensure browser is closed if an error occurred mid-process
            if 'browser' in locals() and browser.is_connected():  # Check if browser exists and is connected
                await browser.close()  # Added await

    if lat is not None and lng is not None:
        results = [
            place for place in results
            if 'latitude' in place and 'longitude' in place and
            haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
        ]

    print(f"\nScraping finished. Found details for {len(results)} places.")
    return results
# --- Example Usage ---
# (Example usage block removed as this script is now intended to be imported as a module)
\ No newline at end of file
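Since the example-usage block was removed from the module, the following is a minimal sketch of driving the scraper directly. The import path is an assumption based on the package name in setup.py; adjust it to wherever scrape_google_maps actually lives:

import asyncio
from gmaps_scraper_server.scraper import scrape_google_maps  # assumed module path

places = asyncio.run(scrape_google_maps(
    "campus stores", max_places=5, lang="en",
    lat=19.99113822646553, lng=73.76191319096492, max_distance_km=30,
))
print(f"Scraped {len(places)} places")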
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dee123a6-7a41-4224-bdfe-abb707987ce5",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3aed9b86-3149-43f0-b80f-332644d2aef4",
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:8000/scrape-get?query=campus%20stores&max_places=5&lang=en&headless=true&lat=19.99113822646553&lng=73.76191319096492&max_distance_km=30\"\n",
"response = requests.get(url)\n",
"data = response.json()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1700a291-3d4d-465d-8351-d7ebf68d6992",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(data)\n",
"\n",
"df[\"latitude\"] = df[\"coordinates\"].apply(lambda x: x[\"latitude\"])\n",
"df[\"longitude\"] = df[\"coordinates\"].apply(lambda x: x[\"longitude\"])\n",
"df.drop(columns=\"coordinates\", inplace=True)\n",
"\n",
"df.to_csv(\"results.csv\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2a79a396-b868-4087-aac3-eeea5efe2363",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('results.csv')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ce5b944c-8e50-4544-8524-ef2dddcc1f24",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>place_id</th>\n",
" <th>address</th>\n",
" <th>rating</th>\n",
" <th>reviews_count</th>\n",
" <th>categories</th>\n",
" <th>website</th>\n",
" <th>phone</th>\n",
" <th>link</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91</td>\n",
" <td>Survey No.207, Pheonix Mall pune, Store No, GP...</td>\n",
" <td>4.5</td>\n",
" <td>76.0</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289690432</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.562243</td>\n",
" <td>73.916699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2b99ce8a282a1:0xe1a794aef1ba34ff</td>\n",
" <td>SHOP NO 3, SHAGUN CHOWK, SR.2773/1,PRITAMDAS P...</td>\n",
" <td>4.4</td>\n",
" <td>46.0</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289148580</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.621968</td>\n",
" <td>73.801811</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>My Campus Store</td>\n",
" <td>0x3bc2bf8d7265c9f7:0xd8359f25ee6e29ca</td>\n",
" <td>2, Shridhar Building, Baner Rd, behind Ancient...</td>\n",
" <td>4.8</td>\n",
" <td>78.0</td>\n",
" <td>['Custom t-shirt store', 'E-commerce service',...</td>\n",
" <td>http://www.mycampusstore.in/</td>\n",
" <td>9637066482</td>\n",
" <td>https://www.google.com/maps/place/My+Campus+St...</td>\n",
" <td>18.557775</td>\n",
" <td>73.799953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2ebdb93a63e61:0xb82636ae4fe6e866</td>\n",
" <td>Shop No. 2, Sr, No- 41, Katraj-Dehu Rd Bypass,...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>['Shoe store', 'Sportswear store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289018492</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.451480</td>\n",
" <td>73.848531</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x395b93d44507f5e5:0x5aff1ce2a3cb1675</td>\n",
" <td>First floor shop no. 14,333, Domestic Airport ...</td>\n",
" <td>4.9</td>\n",
" <td>35.0</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289925505</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.578019</td>\n",
" <td>73.907157</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name place_id \\\n",
"0 Campus Exclusive Store 0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91 \n",
"1 Campus Exclusive Store 0x3bc2b99ce8a282a1:0xe1a794aef1ba34ff \n",
"2 My Campus Store 0x3bc2bf8d7265c9f7:0xd8359f25ee6e29ca \n",
"3 Campus Exclusive Store 0x3bc2ebdb93a63e61:0xb82636ae4fe6e866 \n",
"4 Campus Exclusive Store 0x395b93d44507f5e5:0x5aff1ce2a3cb1675 \n",
"\n",
" address rating reviews_count \\\n",
"0 Survey No.207, Pheonix Mall pune, Store No, GP... 4.5 76.0 \n",
"1 SHOP NO 3, SHAGUN CHOWK, SR.2773/1,PRITAMDAS P... 4.4 46.0 \n",
"2 2, Shridhar Building, Baner Rd, behind Ancient... 4.8 78.0 \n",
"3 Shop No. 2, Sr, No- 41, Katraj-Dehu Rd Bypass,... NaN NaN \n",
"4 First floor shop no. 14,333, Domestic Airport ... 4.9 35.0 \n",
"\n",
" categories \\\n",
"0 ['Shoe store', 'Sportswear store', 'Store'] \n",
"1 ['Shoe store', 'Sportswear store', 'Store'] \n",
"2 ['Custom t-shirt store', 'E-commerce service',... \n",
"3 ['Shoe store', 'Sportswear store'] \n",
"4 ['Shoe store', 'Sportswear store', 'Store'] \n",
"\n",
" website phone \\\n",
"0 https://stores.campusshoes.com/maharashtra/pun... 9289690432 \n",
"1 https://stores.campusshoes.com/maharashtra/pun... 9289148580 \n",
"2 http://www.mycampusstore.in/ 9637066482 \n",
"3 https://stores.campusshoes.com/maharashtra/pun... 9289018492 \n",
"4 https://stores.campusshoes.com/maharashtra/pun... 9289925505 \n",
"\n",
" link latitude longitude \n",
"0 https://www.google.com/maps/place/Campus+Exclu... 18.562243 73.916699 \n",
"1 https://www.google.com/maps/place/Campus+Exclu... 18.621968 73.801811 \n",
"2 https://www.google.com/maps/place/My+Campus+St... 18.557775 73.799953 \n",
"3 https://www.google.com/maps/place/Campus+Exclu... 18.451480 73.848531 \n",
"4 https://www.google.com/maps/place/Campus+Exclu... 18.578019 73.907157 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7aba79b-ba84-49c3-a1bd-2b4ac257fc74",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
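The notebook above flattens the nested coordinates dict by hand before writing the CSV; an equivalent shortcut, assuming the same response shape, is pandas' json_normalize:

df = pd.json_normalize(data).rename(
    columns={"coordinates.latitude": "latitude", "coordinates.longitude": "longitude"}
)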
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dee123a6-7a41-4224-bdfe-abb707987ce5",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3aed9b86-3149-43f0-b80f-332644d2aef4",
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:8000/scrape-get?query=campus%20stores&max_places=5&lang=en&headless=true&lat=19.99113822646553&lng=73.76191319096492&max_distance_km=30\"\n",
"response = requests.get(url)\n",
"data = response.json()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1700a291-3d4d-465d-8351-d7ebf68d6992",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(data)\n",
"\n",
"df[\"latitude\"] = df[\"coordinates\"].apply(lambda x: x[\"latitude\"])\n",
"df[\"longitude\"] = df[\"coordinates\"].apply(lambda x: x[\"longitude\"])\n",
"df.drop(columns=\"coordinates\", inplace=True)\n",
"\n",
"df.to_csv(\"results.csv\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2a79a396-b868-4087-aac3-eeea5efe2363",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('results.csv')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ce5b944c-8e50-4544-8524-ef2dddcc1f24",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>place_id</th>\n",
" <th>address</th>\n",
" <th>rating</th>\n",
" <th>reviews_count</th>\n",
" <th>categories</th>\n",
" <th>website</th>\n",
" <th>phone</th>\n",
" <th>link</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91</td>\n",
" <td>Survey No.207, Pheonix Mall pune, Store No, GP...</td>\n",
" <td>4.5</td>\n",
" <td>76</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289690432</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.562243</td>\n",
" <td>73.916699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be795198099b6e3:0x81def44cd8764dc6</td>\n",
" <td>First Floor, F12, Metro Junction Mall, Shilpha...</td>\n",
" <td>4.8</td>\n",
" <td>112</td>\n",
" <td>['Shoe store', 'Sportswear store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/tha...</td>\n",
" <td>9289677522</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.228908</td>\n",
" <td>73.123019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be7b9c605eea545:0xb793f8af46ce5fac</td>\n",
" <td>Shop No. 1&amp;2, Munshi Estate, Plot No 504, MG R...</td>\n",
" <td>4.7</td>\n",
" <td>100</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/mum...</td>\n",
" <td>9289148572</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.173210</td>\n",
" <td>72.955426</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be7c9013c435c23:0x43967767d741332b</td>\n",
" <td>Selection Ahmed Palace, Plot No. 254 SV Road, ...</td>\n",
" <td>4.4</td>\n",
" <td>125</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/mum...</td>\n",
" <td>9289148575</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.060032</td>\n",
" <td>72.836883</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2bb5812ba0af5:0xaec0dd35c89bc775</td>\n",
" <td>UNIT No-1, FLOOR, 02, GRANT STREET, Phase 1, H...</td>\n",
" <td>4.8</td>\n",
" <td>254</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289690420</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.594074</td>\n",
" <td>73.725319</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name place_id \\\n",
"0 Campus Exclusive Store 0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91 \n",
"1 Campus Exclusive Store 0x3be795198099b6e3:0x81def44cd8764dc6 \n",
"2 Campus Exclusive Store 0x3be7b9c605eea545:0xb793f8af46ce5fac \n",
"3 Campus Exclusive Store 0x3be7c9013c435c23:0x43967767d741332b \n",
"4 Campus Exclusive Store 0x3bc2bb5812ba0af5:0xaec0dd35c89bc775 \n",
"\n",
" address rating reviews_count \\\n",
"0 Survey No.207, Pheonix Mall pune, Store No, GP... 4.5 76 \n",
"1 First Floor, F12, Metro Junction Mall, Shilpha... 4.8 112 \n",
"2 Shop No. 1&2, Munshi Estate, Plot No 504, MG R... 4.7 100 \n",
"3 Selection Ahmed Palace, Plot No. 254 SV Road, ... 4.4 125 \n",
"4 UNIT No-1, FLOOR, 02, GRANT STREET, Phase 1, H... 4.8 254 \n",
"\n",
" categories \\\n",
"0 ['Shoe store', 'Sportswear store', 'Store'] \n",
"1 ['Shoe store', 'Sportswear store'] \n",
"2 ['Shoe store', 'Sportswear store', 'Store'] \n",
"3 ['Shoe store', 'Sportswear store', 'Store'] \n",
"4 ['Shoe store', 'Sportswear store', 'Store'] \n",
"\n",
" website phone \\\n",
"0 https://stores.campusshoes.com/maharashtra/pun... 9289690432 \n",
"1 https://stores.campusshoes.com/maharashtra/tha... 9289677522 \n",
"2 https://stores.campusshoes.com/maharashtra/mum... 9289148572 \n",
"3 https://stores.campusshoes.com/maharashtra/mum... 9289148575 \n",
"4 https://stores.campusshoes.com/maharashtra/pun... 9289690420 \n",
"\n",
" link latitude longitude \n",
"0 https://www.google.com/maps/place/Campus+Exclu... 18.562243 73.916699 \n",
"1 https://www.google.com/maps/place/Campus+Exclu... 19.228908 73.123019 \n",
"2 https://www.google.com/maps/place/Campus+Exclu... 19.173210 72.955426 \n",
"3 https://www.google.com/maps/place/Campus+Exclu... 19.060032 72.836883 \n",
"4 https://www.google.com/maps/place/Campus+Exclu... 18.594074 73.725319 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7aba79b-ba84-49c3-a1bd-2b4ac257fc74",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
playwright
fastapi
uvicorn[standard]
\ No newline at end of file
from setuptools import setup, find_packages
setup(
name="gmaps_scraper_server",
version="0.1",
packages=find_packages(),
install_requires=[
"playwright",
"fastapi",
"uvicorn[standard]"
],
)
\ No newline at end of file
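For running the API locally without Docker, a minimal sketch (the module path is assumed from the package name above and the local-run comment in main_api.py; the port matches the sample notebooks):

# run_local.py (illustrative)
import uvicorn

if __name__ == "__main__":
    uvicorn.run("gmaps_scraper_server.main_api:app", host="0.0.0.0", port=8000)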