elixir-amazon-history/lib/amazon_history/scraper.ex

115 lines
3.0 KiB
Elixir
Raw Permalink Normal View History

2018-04-07 11:29:36 +00:00
defmodule AmazonHistory.Scraper do
  @moduledoc """
  Scrapes the order history of an amazon.co.uk account through a Hound-driven
  browser session.

  The only public entry point is `fetch/3`. It logs in, walks the order-history
  pages of every requested year and returns one row per ordered item in the form
  `[order_number, order_placed, name, total, url]`.
  """

  require Logger
  use Hound.Helpers

  @base_url "https://www.amazon.co.uk"
  @order_url "#{@base_url}/gp/your-account/order-history?opt=ab&digitalOrders=1&unifiedOrders=1&returnTo=&orderFilter=year-"

  # Step used for the `startIndex` query parameter when moving to the next page.
  @orders_per_page 10

  # Header values ("order placed", "order number", ...) of a single order.
  # The leading `.` matters: an XPath starting with `//` is evaluated from the
  # document root even when the lookup is performed within an element, which
  # would make every order on a page report the values of the first order.
  @order_value_xpath ".//span[@class='a-color-secondary value']"

  @doc """
  Logs in with `email` / `password` and scrapes every order placed from
  `start_year` up to and including the current UTC year.

  Returns a list of rows, each being
  `[order_number, order_placed, name, total, url]`.

  The Hound session opened for the scrape is always closed, even when logging
  in or scraping raises; the exception itself is still propagated to the caller.
  """
  @spec fetch(String.t(), String.t(), integer()) :: [[String.t()]]
  def fetch(email, password, start_year) do
    Hound.start_session()

    try do
      login(email, password)
      Logger.info("Logged in successfully. Scraping with start_year: #{start_year}")

      # The end year is read at call time. A module attribute would be frozen at
      # compile time and silently skip every year after the build year.
      end_year = DateTime.utc_now().year

      Enum.flat_map(start_year..end_year, &process_year(&1, 0, []))
    after
      Hound.end_session()
    end
  end

  # Walks through the two-step sign-in form (email first, then password).
  defp login(email, password) do
    navigate_to(@base_url)

    click(find_element(:id, "nav-link-yourAccount"))

    fill_field(find_element(:id, "ap_email"), email)
    click(find_element(:id, "continue"))

    fill_field(find_element(:id, "ap_password"), password)
    click(find_element(:id, "signInSubmit"))
  end

  # Loads the order-history page of `year` starting at `start_index` and hands
  # the order elements found on it to `process_order_elements/4`, which either
  # stops (empty page) or recurses back here for the next page.
  defp process_year(year, start_index, items) do
    navigate_to(order_url(year, start_index))

    find_element(:id, "ordersContainer")
    |> find_all_within_element(:class, "order")
    |> process_order_elements(year, start_index, items)
  end

  # Builds the order-history URL for one page of one year.
  defp order_url(year, start_index) do
    @order_url <> to_string(year) <> "&startIndex=" <> to_string(start_index)
  end

  # A page without orders marks the end of the year: return what was collected.
  defp process_order_elements([], _year, _start_index, items), do: items

  # Otherwise collect the rows of this page and continue with the next page.
  defp process_order_elements(order_elements, year, start_index, items) do
    new_items = Enum.flat_map(order_elements, &process_order_element/1)
    process_year(year, start_index + @orders_per_page, items ++ new_items)
  end

  # Turns one order element into the rows of its product links. The order-level
  # values are extracted once and shared by all rows of the order.
  defp process_order_element(order_element) do
    order_placed = extract_order_placed(order_element)
    order_number = extract_order_number(order_element)
    total = extract_total(order_element)

    order_element
    |> find_all_within_element(:class, "a-link-normal")
    |> Enum.map(&to_row(&1, order_placed, order_number, total))
    |> Enum.filter(&valid_row?/1)
  end

  # Text of the first header value span inside the order.
  defp extract_order_placed(order_element) do
    order_element
    |> find_within_element(:xpath, @order_value_xpath)
    |> inner_text()
  end

  # Text of the third header value span inside the order.
  defp extract_order_number(order_element) do
    order_element
    |> find_within_element(:xpath, "(#{@order_value_xpath})[3]")
    |> inner_text()
  end

  # Text of the order's price element, or "" when the order shows none.
  defp extract_total(order_element) do
    order_element
    |> search_within_element(:class, "a-color-price", 1)
    |> total_or_empty_string()
  end

  defp total_or_empty_string({:ok, element}), do: inner_text(element)
  defp total_or_empty_string({:error, _reason}), do: ""

  # Builds the output row for a single link found inside an order.
  defp to_row(item_element, order_placed, order_number, total) do
    name = inner_text(item_element)
    url = attribute_value(item_element, :href)
    [order_number, order_placed, name, total, url]
  end

  # Keeps only links that have a visible name and point to a product page.
  # Links without an `href` are rejected instead of crashing
  # `String.contains?/2` on a non-binary value.
  defp valid_row?([_order_number, _order_placed, name, _total, url]) when is_binary(url) do
    String.trim(name) != "" and String.contains?(url, "gp/product")
  end

  defp valid_row?(_row), do: false
end