#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, os, re, time
from urllib2 import HTTPError
from mechanize import LinkNotFoundError
from BeautifulSoup import BeautifulSoup
from models import Shoe, ShoeImage
import mechanize

def unique(s):
	"""Return the distinct elements of s as a list, in no guaranteed order.

	Examples: unique([1,2,3,1,2,3]) is some permutation of [1,2,3];
	unique("abcabc") is some permutation of ["a", "b", "c"];
	unique(([1, 2], [2, 3], [1, 2])) is some permutation of
	[[2, 3], [1, 2]].

	Three strategies are tried, fastest first:

	1. Hashing -- needs every element to be hashable; usually linear time.
	2. Sorting -- needs the elements to be mutually orderable (sorting
	   must not raise TypeError); usually O(N*log2(N)) time.
	3. Pairwise comparison -- only needs equality testing; usually
	   quadratic time.
	"""

	size = len(s)
	if size == 0:
		return []

	# Strategy 1: use the elements as dict keys.  An unhashable element
	# raises TypeError, normally early on, so a failed attempt is cheap.
	seen = {}
	try:
		for item in s:
			seen[item] = 1
	except TypeError:
		seen = None  # fall through to the next strategy
	if seen is not None:
		return seen.keys()

	# Strategy 2: sorting puts equal elements next to each other, so one
	# pass that compares each element with the last one kept is enough.
	try:
		ordered = sorted(s)
	except TypeError:
		ordered = None  # fall through to the next strategy
	if ordered is not None:
		kept = ordered[:1]
		for item in ordered[1:]:
			if item != kept[-1]:
				kept.append(item)
		return kept

	# Strategy 3: brute force, relying on equality tests only.
	distinct = []
	for item in s:
		if item not in distinct:
			distinct.append(item)
	return distinct

def download_product_image(image_dir, url, id, suffix):
	"""Download one image and write it to <image_dir>/shoe/<id>-<suffix>.jpg.

	image_dir -- base media directory; its "shoe" sub-directory must
	             already exist (it is not created here).
	url       -- address of the image to fetch.
	id        -- string used as the first part of the file name.
	suffix    -- string used as the second part of the file name.

	Returns the bare file name (no directory) on success, or '' when the
	server answers with an HTTP error.  Errors while writing the file
	(IOError/OSError) are not caught.
	"""
	browser = mechanize.Browser()
	browser.set_handle_robots(False)
	try:
		image_data = browser.open(url).get_data()
	except HTTPError:
		print('File not found on server')
		return ''

	filename = id + "-" + suffix + ".jpg"
	path = os.path.join(image_dir, "shoe", filename)
	print('Saving Data..............')
	# Mode 'wb' creates the file when it is missing and truncates an older,
	# possibly longer, copy.  The previous os.open(..., os.O_WRONLY) call did
	# neither, so new images failed and replaced ones could keep stale bytes.
	image_file = open(path, 'wb')
	try:
		image_file.write(image_data)
	finally:
		# Release the handle even when the write fails.
		image_file.close()

	print('Data Saved to  %s  -  %s bytes' % (filename, os.stat(path).st_size))
	return filename

def download_all_product_images(image_dir):
	"""Download the medium image, then the big image, of every Shoe.

	Two complete passes over Shoe.objects are made: the first fetches each
	shoe's medium image (file name suffix 'm'), the second each shoe's big
	image (suffix 'b').  The file name returned by
	download_product_image() is assigned to the image object's ``image``
	attribute.

	NOTE(review): save() is never called on the image objects here, so the
	assigned file names are presumably not persisted -- confirm whether
	that is intended.

	image_dir -- base media directory handed on to download_product_image().
	"""
	# (progress message, Shoe attribute holding the image, file name suffix)
	passes = (
		(".......Downloading medium image @", "medium_image", 'm'),
		(".......Downloading big image @", "big_image", 'b'),
	)

	for announcement, image_attr, suffix in passes:
		for shoe in Shoe.objects.all().iterator():
			image = getattr(shoe, image_attr)
			print(announcement + " " + str(image))
			image.image = download_product_image(image_dir, image.image_url, str(shoe.id), suffix)

	print("End Function")

def download_product_info(url, image_dir):
	"""Scrape one product page and store it as a Shoe plus ShoeImage rows.

	The page at url is parsed for name, site id, original price, discount,
	sale price, shipping text and brand.  The medium image URL is taken
	from the product page; the remaining image URLs are taken from the
	site's view_all_direction page (first match = big image, further
	matches = other images).  Only image URLs are recorded; no image file
	is downloaded here.

	Nothing is stored when a Shoe with the same website_id already exists.
	All scraping is finished before the first row is saved, so a page that
	cannot be parsed does not leave a lone ShoeImage row behind.

	url       -- address of the product detail page.
	image_dir -- unused; kept so that existing callers keep working.

	Returns None.  Network, parsing and database errors propagate to the
	caller (for instance IndexError when no big image URL is found), except
	for failures while storing the additional images, which are reported
	and skipped.
	"""
	print("Downloading Product from %s ........" % (url,))
	shoe = Shoe()
	shoe.url = url

	browser = mechanize.Browser()
	browser.set_handle_robots(False)
	soup = BeautifulSoup(browser.open(url).get_data())

	print("Getting Name.............")
	name_soup = soup.find("div", {"id" : re.compile("main1"), "class" : "main1"})
	shoe.name = name_soup.strong.renderContents()

	# Left column: hidden form inputs (id, price, discount) and the medium image.
	left_soup = soup.find("div", {"id" : "main2-left"})

	print("Getting Id.............")
	shoe.website_id = left_soup.find("input", {"name" : "good_id"})["value"]

	print("Comparing Id...........")
	# Ask the database directly rather than calling get() inside a bare
	# except: that treated every error, including a database failure, as
	# "not stored yet".
	if Shoe.objects.filter(website_id=shoe.website_id).count() > 0:
		print("Product already in database")
		return
	print("Updating product........")

	# Original price
	shoe.ori_price = left_soup.find("input", {"name" : "price"})["value"]

	# Discount.  NOTE(review): the page value is presumably "tenths of the
	# original price" (8 -> 20 percent off) -- confirm against the site.
	discount_soup = left_soup.find("input", {"name" : "discount"})
	shoe.discount = 100 - 100*( float(discount_soup["value"]) / 10)

	# Medium image URL
	medium_image_url = left_soup.find("img")["src"]

	# Centre column: sale price and shipping text.
	center_soup = soup.find("div", {"id" : "main2-center"})
	shoe.sale_price = center_soup.find("span", {"class" : "main2-center-jg"}).renderContents()
	print(shoe.sale_price)
	shoe.shipping_text = center_soup.ul.contents[3].renderContents()

	# Top of the right column: brand.
	right_top_soup = soup.find("div", {"id" : "main2-right-top"})
	shoe.brand = right_top_soup.contents[1].contents[3].find("b").renderContents()

	# Keep the whole page markup as the info text.
	shoe.info = str(soup)

	# Big and additional image URLs are listed in the first <script> of the
	# view_all_direction page.  The match is non-greedy and the dot is
	# escaped so that adjacent URLs are never merged into one match.
	big_image_page = browser.open('http://www.paixie.net/view_all_direction.php?id='+shoe.website_id).get_data()
	big_image_soup = BeautifulSoup(big_image_page)
	big_image_array = re.findall(r'http://\S+?\.jpg', str(big_image_soup.script.contents))
	big_image_url = big_image_array[0]

	# Everything needed is known; start writing to the database.  Each image
	# is saved before it is attached to the shoe.
	medium_image = ShoeImage()
	medium_image.image_url = medium_image_url
	medium_image.save()
	shoe.medium_image = medium_image

	big_image = ShoeImage()
	big_image.image_url = big_image_url
	big_image.save()
	shoe.big_image = big_image
	shoe.save()

	for other_url in big_image_array[1:]:
		other_image = ShoeImage()
		other_image.image_url = other_url
		try:
			other_image.save()
			shoe.other_images.add(other_image)
		except Exception:
			# Extra images stay best-effort, but the failure is reported.
			print("Could not store extra image %s: %s" % (other_url, sys.exc_info()[1]))
	
	
def _open_start_page(browser, url):
	"""Open url in browser, reporting (not raising) an HTTP error status."""
	try:
		browser.open(url)
	except HTTPError:
		# The crawl has always carried on after an HTTP error here; keep
		# that best-effort behaviour but make the failure visible.
		print("HTTP error while opening %s: %s" % (url, sys.exc_info()[1]))


def _follow_next_page(browser, next_page_re, index_page_url):
	"""Follow the "next page" link; return False when there is none.

	Any other error leads to a 30 second pause, a reload of
	index_page_url and another attempt, without an upper limit.  An
	exception raised by that reload is not caught.
	"""
	while True:
		try:
			print("Following Next Lnk......")
			browser.follow_link(text_regex=next_page_re)
		except LinkNotFoundError:
			print("Reached the end of the Section")
			print("Ended Link")
			return False
		except Exception:
			# "except Exception" rather than a bare except, so that Ctrl-C
			# can still interrupt the retry loop.
			print("An error has occured....Waiting 30 secs to retry")
			time.sleep(30)
			browser.open(index_page_url)
		else:
			print("Link Followed Through")
			return True


def _crawl_listing_pages(browser, image_dir):
	"""Scrape all products on the current listing page and every next page.

	A product that cannot be scraped is reported and skipped, so one bad
	page does not end the crawl.
	"""
	# Dot and question mark are escaped: the links wanted are literally
	# "viewInfoShoes.php?...".
	product_link_re = re.compile(r"viewInfoShoes\.php\?")
	# Link text meaning "next page", encoded the way the site serves it.
	next_page_re = re.compile(unicode('下一页', 'utf-8').encode('gb2312'))

	page_number = 1
	while True:
		print("Page Number:  %d" % page_number)
		# Remember where we are so a failed navigation can reload this page.
		index_page_url = browser.geturl()
		urls = [link.absolute_url for link in browser.links(url_regex=product_link_re)]
		unique_links = unique(urls)
		print("Found %d products to be downloaded" % len(unique_links))

		for unique_link in unique_links:
			try:
				download_product_info(unique_link, image_dir)
			except Exception:
				print("!!!!!! Download failed !!!!!!!")
				print("%s: %s" % (unique_link, sys.exc_info()[1]))

		if not _follow_next_page(browser, next_page_re, index_page_url):
			break
		page_number = page_number + 1


def download_all_info(image_dir):
	"""Crawl the three shoe categories of www.paixie.net.

	Starting from the home page, the link of each category is followed in
	turn and every listing page of that category is scraped with
	download_product_info().

	image_dir -- passed through to download_product_info().
	"""
	browser = mechanize.Browser()
	browser.set_handle_robots(False)
	_open_start_page(browser, "http://www.paixie.net")

	# Category link texts: women's, children's and men's shoes.
	cat_strings = ('女鞋','童鞋','男鞋',)

	for cat_string in cat_strings:
		category = unicode(cat_string, 'utf-8').encode('gb2312')
		browser.follow_link( text_regex=re.compile(category) )
		_crawl_listing_pages(browser, image_dir)


def download_all_info_from_page(image_dir, start_url="http://www.paixie.net/showShoes.php?class=1&page=34"):
	"""Crawl listing pages beginning at start_url.

	Used to resume a crawl part-way through a category: the page at
	start_url and every following "next page" are scraped with
	download_product_info().

	image_dir -- passed through to download_product_info().
	start_url -- listing page to begin with; defaults to the address that
	             used to be hard-coded here.
	"""
	browser = mechanize.Browser()
	browser.set_handle_robots(False)
	_open_start_page(browser, start_url)
	_crawl_listing_pages(browser, image_dir)



# Manual entry point: exactly one step is enabled at a time; the others are
# kept as comments so they can be switched on by hand.
MEDIA_DIR = '/home/chris/Development/Internet/bbxiespider/media/'

#download_all_info(MEDIA_DIR)
#download_all_info_from_page(MEDIA_DIR)
download_all_product_images(MEDIA_DIR)

#download_product_info('http://www.paixie.net/viewInfoShoes.php?id=1702&class=2', MEDIA_DIR)

