#! /usr/bin/python #-*- coding: utf-8 -*- # Script per effetuare il parsing delle pagine dati INRAN e per # salvare i dati in maniera ordinata in un database SQlite3 # Copyright (C) 2008 Alessandro De Noia # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. import re, sqlite3, string from BeautifulSoup import BeautifulSoup from time import time from urllib import urlopen def NormalizeStr(FoodStr): """Restituisce la stringa senza il commento tra parentesi quadre e senza apici Return a new string without the comment in the square brackets and without the apices """ if string.find(FoodStr, "[") == -1: NewFoodStr = FoodStr else: NewFoodStr = FoodStr[0:(string.find(FoodStr, "[")-1)] return string.join(string.split(NewFoodStr,'\''),"\\''") def NormalizeNum(DataNum): """Restituisce un numero float cambiando la virgola in punto o altrimenti cambia la stringa None nel valore NULL Return a float number turning the comma into a dot or else change the string None into the value NULL""" if string.find(DataNum, ",") != -1: NewDataNum = round(float(string.join(string.split(DataNum,","),".")),2) elif (DataNum == "None") or (DataNum == "tr") or (DataNum == " "): NewDataNum = -1.00 else: NewDataNum = round(float(DataNum),2) return NewDataNum def Database(): """Crea, popola e riempie il database con i dati acquisiti Create, populate and fill the database with the acquired data""" data = ['']*22 rows = ('inran_id', 'pe', 'acqua', 'proteine', 'lipidi', 'carboidrati', 'amido', 'zucchero', 'fibra', 'energia_c', 'energia_j', 'sodio', 'potassio', 'ferro', 'calcio', 'fosforo', 'tiamina', 'riboflavina', 'niacina', 'vit_a', 'vit_c', 'vit_e') categories = ('cereali', 'legumi', 'verdure', 'frutta', 'carni_fresche', 'carni_trasformate', 'fast_food', 'frattaglie', 'pesca', 'latte', 'formaggi', 'uova', 'olii', 'dolci', 'vari', 'alcooliche') Nid = 0 conn = sqlite3.connect("inran") cur = conn.cursor() cur.execute("""CREATE table inran (id INTEGER PRIMARY KEY, inran_id FLOAT, categoria VARCHAR NOT NULL, alimento VARCHAR NOT NULL, pe FLOAT DEFAULT 100, acqua FLOAT DEFAULT NULL, proteine FLOAT DEFAULT NULL, lipidi FLOAT DEFAULT NULL, carboidrati FLOAT DEFAULT NULL, amido FLOAT DEFAULT NULL, zucchero FLOAT DEFAULT NULL, fibra FLOAT DEFAULT NULL, energia_c FLOAT DEFAULT NULL, energia_j FLOAT DEFAULT NULL, sodio FLOAT DEFAULT NULL, potassio FLOAT DEFAULT NULL, ferro FLOAT DEFAULT NULL, calcio FLOAT DEFAULT NULL, fosforo FLOAT DEFAULT NULL, tiamina FLOAT DEFAULT NULL, riboflavina FLOAT DEFAULT NULL, niacina FLOAT DEFAULT NULL, vit_a FLOAT DEFAULT NULL, vit_c FLOAT DEFAULT NULL, vit_e FLOAT DEFAULT NULL)""") for category in categories: try: socket = urlopen('http://www.inran.it/servizi_cittadino/per_saperne_di_piu/tabelle_composizione_alimenti/composizione/'+category) except: print("Impossible to resolve the host, please check internet connection") exit() page = BeautifulSoup(socket.read()) socket.close() food = page.findAll('td', {"class":re.compile("descrizione_[1-2]")}) FoodData = page.findAll('td', {"class":re.compile("dato_[1-2]")}) n = 0 for i in food: StrSql = ("""INSERT INTO inran (id, categoria, alimento) VALUES (%d,'%s','%s')""") % (Nid, category, NormalizeStr(i.string)) cur.execute(StrSql) data = FoodData[(22*n):(22*(n+1))] m = 0 for k in data: StrSql2 = ("UPDATE inran SET %s=%f WHERE id=%d") % (rows[m], NormalizeNum(str(k.string)), Nid) cur.execute(StrSql2) m = m + 1 n= n + 1 Nid = Nid + 1 cur.close() conn.commit() conn.close() time1 = time() print("-------------------------------------------------------------") print("Sto creando il database, attendere") print("-------------------------------------------------------------") Database() time2 = time() print("-------------------------------------------------------------") print("Database creato in %f secondi") % (time2 - time1) print("-------------------------------------------------------------")