<wwwconf>
 <!-- Conference days, one <day> record per line.
      <id> is the key that each event's <day> element refers to;
      <name> is the calendar date as text, <desc> a short description of the day. -->
 <days>
  <day><id>sun</id><name>Sunday, May 6, 2007</name><desc>Pre-conference W3C AC Meeting</desc></day>
  <day><id>mon</id><name>Monday, May 7, 2007</name><desc>Pre-conference W3C AC Meeting and More</desc></day>
  <day><id>tue</id><name>Tuesday, May 8, 2007</name><desc>Building the Web</desc></day>
  <day><id>wed</id><name>Wednesday, May 9, 2007</name><desc>The Global Web</desc></day>
  <day><id>thu</id><name>Thursday, May 10, 2007</name><desc>Mining the Web</desc></day>
  <day><id>fri</id><name>Friday, May 11, 2007</name><desc>Weaving the Web</desc></day>
  <day><id>sat</id><name>Saturday, May 12, 2007</name><desc>The World Wide Web and You</desc></day>
 </days>
 <!-- Time slots within a day, one <slot> record per line, in document order.
      <id> is the key listed inside each event's <slots> element;
      <desc> is the human-readable time range or label. -->
 <slots>
  <slot><id>am1</id><desc>8:30am to 10:00am</desc></slot>
  <slot><id>break1</id><desc>break</desc></slot>
  <slot><id>am2</id><desc>10:30am to 12 noon</desc></slot>
  <slot><id>lunch</id><desc>lunch break</desc></slot>
  <slot><id>pm1</id><desc>1:30pm to 3:00pm</desc></slot>
  <slot><id>break2</id><desc>break</desc></slot>
  <slot><id>pm2</id><desc>3:30pm to 5:00pm</desc></slot>
  <slot><id>pm3</id><desc>5:00pm to 5:30pm</desc></slot>
  <slot><id>evening</id><desc>Evening</desc></slot>
 </slots>
 <!-- Physical locations that rooms belong to, one <location> record per line.
      <id> is the key used by each room's <location> element. -->
 <locations>
  <location><id>HM1</id><desc>Hotel, Mezzanine 1</desc></location>
  <location><id>HM2</id><desc>Hotel, Mezzanine 2</desc></location>
  <location><id>CC1</id><desc>Conference Centre, Level 1</desc></location>
  <location><id>CC2</id><desc>Conference Centre, Level 2</desc></location>
  <!-- Added so that the "grizzly" room's <location>Offsite</location> resolves to a record
       instead of dangling. NOTE(review): the description text is a placeholder; confirm wording. -->
  <location><id>Offsite</id><desc>Offsite</desc></location>
 </locations>
 <!-- Rooms, one <room> record per line.
      <id> is the key used by each event's <room> element; <location> is meant to match
      a <location><id> under <locations>.
      <format> and <capacity> are kept verbatim, including the "?" formats and the
      ranged capacities such as "32-50"; consumers should treat them as free text.
      Fix: the "empress" record previously had its </location> end tag broken across a
      line boundary, which made the document not well-formed. -->
 <rooms>
  <room><id>vanhorne</id><name>VanHorne</name><location>CC2</location><format>hybrid</format><capacity>832</capacity></room>
  <room><id>shaughn</id><name>Shaughnessy</name><location>CC2</location><format>classroom</format><capacity>90</capacity></room>
  <room><id>beatty</id><name>Beatty</name><location>CC2</location><format>classroom</format><capacity>90</capacity></room>
  <room><id>coleman</id><name>Coleman</name><location>CC2</location><format>classroom</format><capacity>90</capacity></room>
  <room><id>theatre</id><name>Theatre</name><location>CC1</location><format>theatre</format><capacity>252</capacity></room>
  <room><id>mtstephen</id><name>Mt Stephen</name><location>HM1</location><format>rounds</format><capacity>150</capacity></room>
  <room><id>petrak</id><name>Petrak</name><location>HM2</location><format>rounds</format><capacity>45</capacity></room>
  <room><id>newbrunswick</id><name>New Brunswick</name><location>HM2</location><format>classroom</format><capacity>112</capacity></room>
  <room><id>alberta</id><name>Alberta</name><location>HM2</location><format>classroom</format><capacity>176</capacity></room>
  <room><id>cascade</id><name>Cascade</name><location>HM2</location><format>classroom</format><capacity>300</capacity></room>
  <room><id>oak</id><name>Oak</name><location>HM1</location><format>?</format><capacity>32-50</capacity></room>
  <room><id>norquay</id><name>Norquay</name><location>HM1</location><format>?</format><capacity>32-50</capacity></room>
  <room><id>frontenac</id><name>Frontenac</name><location>HM1</location><format>?</format><capacity>24-40</capacity></room>
  <room><id>champlain</id><name>Champlain</name><location>HM1</location><format>?</format><capacity>40-50</capacity></room>
  <room><id>mckenzie</id><name>McKenzie</name><location>HM1</location><format>?</format><capacity>20-40</capacity></room>
  <room><id>palliser</id><name>Palliser</name><location>HM1</location><format>?</format><capacity>24-40</capacity></room>
  <room><id>empress</id><name>Empress</name><location>HM1</location><format>?</format><capacity>16-28</capacity></room>
  <room><id>lacombe</id><name>Lacombe</name><location>HM1</location><format>?</format><capacity>12-28</capacity></room>
  <room><id>alhambra</id><name>Alhambra</name><location>HM2</location><format>rounds</format><capacity>200</capacity></room>
  <room><id>grizzly</id><name>Grizzly House</name><location>Offsite</location><format>restaurant</format><capacity>60</capacity></room>
 </rooms>
 <!-- Programme tracks, one <track> record per line.
      <id> is the key used by each event's <track> element; <name> is the display label.
      <colour> holds a colour name per track (these look like CSS/X11 named colours;
      presumably used for rendering, confirm against the consumer). -->
 <tracks>
  <track><id>W3CAC</id><name>W3C AC Meeting</name><colour>Yellow</colour></track>
  <track><id>W4A</id><name>W4A Conference</name><colour>Aquamarine</colour></track>
  <track><id>wow</id><name>World Organization of Webmasters</name><colour>Orange</colour></track>
  <track><id>tutorial</id><name>Tutorials</name><colour>Pink</colour></track>
  <track><id>workshop</id><name>Workshops</name><colour>LightCoral</colour></track>
  <track><id>plenary</id><name>Plenaries</name><colour>Aqua</colour></track>
  <track><id>paper</id><name>Refereed Papers</name><colour>LightSkyBlue</colour></track>
  <track><id>panel</id><name>Panels</name><colour>Turquoise</colour></track>
  <track><id>posters</id><name>Posters</name><colour>LightCyan</colour></track>
  <track><id>sponsored</id><name>Sponsored Talks</name><colour>PaleGoldenrod</colour></track>
  <track><id>devel</id><name>Developers Track</name><colour>MediumSpringGreen</colour></track>
  <track><id>W3C</id><name>W3C Track</name><colour>Salmon</colour></track>
  <track><id>history</id><name>Web History Track</name><colour>MediumSlateBlue</colour></track>
  <track><id>museum</id><name>Web History Display</name><colour>Silver</colour></track>
  <track><id>exhibition</id><name>Sponsor Exhibition</name><colour>Violet</colour></track>
  <track><id>bof</id><name>BOFs</name><colour>Fuchsia</colour></track>
  <track><id>misc</id><name>Miscellaneous</name><colour>WhiteSmoke</colour></track>
 </tracks>
 <events>
<event><room>mtstephen</room><day>sun</day><track>W3CAC</track><url></url><desc>6:30pm to 9:30pm</desc><wide>0</wide><name>W3C AC Reception</name><id></id><type></type><slots><slot>evening</slot></slots></event><event><room>cascade</room><day>sun</day><track>W3CAC</track><url></url><desc>Orientation for new members</desc><wide>0</wide><name>W3C AC Meeting</name><id></id><type></type><slots><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>alhambra</room><day>mon</day><track>W3CAC</track><url></url><desc></desc><wide>0</wide><name>W3C AC Lunch</name><id></id><type></type><slots><slot>lunch</slot></slots></event><event><room>cascade</room><day>mon</day><track>W3CAC</track><url></url><desc>(Day 1 of 2)</desc><wide>0</wide><name>W3C AC Meeting</name><id></id><type></type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>alberta</room><day>mon</day><track>wow</track><url>http://www2007.org/wow.php</url><desc>Current Best Practices in Web Development and Design Tutorial (with WOW Web Professional Certification Exam Option)</desc><wide>0</wide><name>World Organization of Webmasters (WOW)</name><id></id><type></type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>alberta</room><day>mon</day><track>wow</track><url></url><desc>(Petrak or Riverview Lounge)</desc><wide>0</wide><name>WOW Lunch</name><id></id><type></type><slots><slot>lunch</slot></slots></event><event><room>newbrunswick</room><day>mon</day><track>W4A</track><url>http://www.w4a.info/</url><desc>(Day 1 of 2)</desc><wide>0</wide><name>Web Accessibility (W4A) Conference</name><id></id><type></type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>newbrunswick</room><day>mon</day><track>W4A</track><url></url><desc></desc><wide>0</wide><name>W4A 
Lunch</name><id></id><type></type><slots><slot>lunch</slot></slots></event><event><room>grizzly</room><day>mon</day><track>W4A</track><url></url><desc>Offsite at Grizzly House, 6:00pm to 8:00pm</desc><wide>0</wide><name>W4A Banquet</name><id></id><type></type><slots><slot>evening</slot></slots></event><event><room>empress</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T0.php</url><desc>(Full-day tutorial)</desc><wide>0</wide><name>Robust Management of Web and Grid Ontologies and Metadata</name><id>T0</id><type>tutorial</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>shaughn</room><day>tue</day><track>workshop</track><url>http://www2007.org/workshop-W1.php</url><desc>(Full-day workshop)</desc><wide>0</wide><name>AIRWeb 2007: Adversarial Information Retrieval on the Web</name><id>W1</id><type>workshop</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>frontenac</room><day>tue</day><track>workshop</track><url>http://www2007.org/workshop-W2.php</url><desc>(Full-day workshop)</desc><wide>0</wide><name>Health Care and Life Sciences Data Integration for the Semantic Web</name><id>W2</id><type>workshop</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>mckenzie</room><day>tue</day><track>workshop</track><url>http://www2007.org/workshop-W3.php</url><desc>(Full-day workshop)</desc><wide>0</wide><name>I3: Identity, Identifiers, Identifications -- Entity-Centric Approaches to Information and Knowledge Management on the Web</name><id>W3</id><type>workshop</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>newbrunswick</room><day>tue</day><track>W4A</track><url>http://www.w4a.info/</url><desc>(Day 2 of 2)</desc><wide>0</wide><name>Web Accessibility (W4A) 
Conference</name><id></id><type></type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>palliser</room><day>tue</day><track>workshop</track><url>http://www2007.org/workshop-W5.php</url><desc>(Full-day workshop)</desc><wide>0</wide><name>MobEA V: Mobile Web in the Developing World</name><id>W5</id><type>workshop</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>cascade</room><day>tue</day><track>W3CAC</track><url></url><desc>(Day 2 of 2)</desc><wide>0</wide><name>W3C AC Meeting</name><id></id><type></type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>beatty</room><day>tue</day><track>workshop</track><url>http://www2007.org/workshop-W6.php</url><desc>(Full-day workshop)</desc><wide>0</wide><name>Query Log Analysis: Social and Technological Challenges</name><id>W6</id><type>workshop</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>alberta</room><day>tue</day><track>workshop</track><url>http://www2007.org/workshop-W7.php</url><desc>(Full-day workshop)</desc><wide>0</wide><name>Social and Collaborative Construction of Structured Knowledge</name><id>W7</id><type>workshop</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>champlain</room><day>tue</day><track>workshop</track><url>http://www2007.org/workshop-W8.php</url><desc>(Full-day workshop)</desc><wide>0</wide><name>Sponsored Search Auctions</name><id>W9</id><type>workshop</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>coleman</room><day>tue</day><track>workshop</track><url>http://www2007.org/workshop-W9.php</url><desc>(Full-day workshop)</desc><wide>0</wide><name>Tagging and Metadata for Social Information 
Organization</name><id>W8</id><type>workshop</type><slots><slot>am1</slot><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>theatre</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T1.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>Foundations and Challenges of Web Advertising</name><id>T1</id><type>tutorial</type><slots><slot>am1</slot><slot>am2</slot></slots></event><event><room>theatre</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T5.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>Learning to Rank in Vector Spaces and Social Networks</name><id>T5</id><type>tutorial</type><slots><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>oak</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T2.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>User-Centric Identity for Web Applications</name><id>T2</id><type>tutorial</type><slots><slot>am1</slot><slot>am2</slot></slots></event><event><room>oak</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T6.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>An Integrated Approach to Evaluating Web Accessibility: Automated, Manual and User-based Testing</name><id>T6</id><type>tutorial</type><slots><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>lacombe</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T3.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>Cost-Effective Engineering of Web Application Product Lines (Reuse Beyond Components: Exploiting Similarity Patterns)</name><id>T3</id><type>tutorial</type><slots><slot>am1</slot><slot>am2</slot></slots></event><event><room>norquay</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T7.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>Deploying Web-scale Mash-ups by Linking Microformats and the Semantic 
Web</name><id>T7</id><type>tutorial</type><slots><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>norquay</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T4.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>Next Generation Web Services in Practice</name><id>T4</id><type>tutorial</type><slots><slot>am1</slot><slot>am2</slot></slots></event><event><room>lacombe</room><day>tue</day><track>tutorial</track><url>http://www2007.org/tutorial-T8.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>Web Workloads: Characterization, Modeling and Application</name><id>T8</id><type>tutorial</type><slots><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>alberta</room><day>tue</day><track>history</track><url>http://www2007.org/webhistory.php</url><desc>(5:00pm to 8:00pm, starting in Riverview Lounge)</desc><wide>1</wide><name>Web History Reception</name><id></id><type></type><slots><slot>evening</slot></slots></event><event><room>vanhorne</room><day>wed</day><track>plenary</track><url>http://www2007.org/berners-lee.php</url><desc>The Two Magics of Web Science (Van Horne Ballroom)</desc><wide>1</wide><name>WWW2007 Opening Ceremonies and Plenary Speaker: Tim Berners-Lee (MIT/W3C)</name><id></id><type></type><slots><slot>am1</slot></slots></event><event><room>vanhorne</room><day>wed</day><track>exhibition</track><url></url><desc>(Van Horne C)</desc><wide>0</wide><name>Sponsor Exhibition Hall</name><id></id><type></type><slots><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>vanhorne</room><day>thu</day><track>exhibition</track><url></url><desc>(Van Horne C)</desc><wide>0</wide><name>Sponsor Exhibition Hall</name><id></id><type></type><slots><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>vanhorne</room><day>fri</day><track>exhibition</track><url></url><desc>(Van Horne C)</desc><wide>0</wide><name>Sponsor Exhibition 
Hall</name><id></id><type></type><slots><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>petrak</room><day>wed</day><track>museum</track><url></url><desc>Exhibit</desc><wide>0</wide><name>Web History Display</name><id></id><type></type><slots><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>petrak</room><day>thu</day><track>museum</track><url></url><desc>Exhibit</desc><wide>0</wide><name>Web History Display</name><id></id><type></type><slots><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>petrak</room><day>fri</day><track>museum</track><url></url><desc>Exhibit</desc><wide>0</wide><name>Web History Display</name><id></id><type></type><slots><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>petrak</room><day>sat</day><track>museum</track><url></url><desc>Exhibit</desc><wide>0</wide><name>Web History Display</name><id></id><type></type><slots><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>shaughn</room><day>wed</day><track>history</track><url>http://www2007.org/webhistory.php</url><desc>Pioneers of the Web and E-Commerce</desc><wide>0</wide><name>Web History Track</name><id></id><type></type><slots><slot>am2</slot><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>vanhorne</room><day>wed</day><track>misc</track><url></url><desc>(Van Horne Ballroom and Foyer, 6:00pm to 8:00pm)</desc><wide>1</wide><name>WWW2007 Welcome Reception</name><id></id><type></type><slots><slot>evening</slot></slots></event><event><room>beatty</room><day>wed</day><track>paper</track><url></url><desc>1 of 2 (Communication in Developing Regions)</desc><wide>0</wide><name>Technology for Developing Regions</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>beatty</room><day>wed</day><track>paper</track><url></url><desc>2 of 2 (Networking Issues in the Web)</desc><wide>0</wide><name>Technology for Developing 
Regions</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>coleman</room><day>wed</day><track>panel</track><url>http://www2007.org/panel1.php</url><desc>Phillip Hallam-Baker (Verisign)</desc><wide>0</wide><name>Web Science</name><id></id><type>panel</type><slots><slot>am2</slot></slots></event><event><room>beatty</room><day>wed</day><track>panel</track><url>http://www2007.org/panel2.php</url><desc>Amit Nanavati (IBM India)</desc><wide>0</wide><name>Web Delivery Models for Developing Regions</name><id></id><type>panel</type><slots><slot>pm1</slot></slots></event><event><room>coleman</room><day>wed</day><track>tutorial</track><url>http://www2007.org/tutorial-T9.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>Semantic Digital Libraries</name><id>T9</id><type>tutorial</type><slots><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>theatre</room><day>wed</day><track>sponsored</track><url>http://www2007.org/industrytopic1.php</url><desc>Search and Data</desc><wide>0</wide><name>Industry Talks</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>newbrunswick</room><day>wed</day><track>paper</track><url></url><desc>1 of 2 (Querying and Transforming XML)</desc><wide>0</wide><name>XML and Web Data</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>newbrunswick</room><day>thu</day><track>paper</track><url></url><desc>2 of 2 (Parsing, Normalizing, and Storing XML)</desc><wide>0</wide><name>XML and Web Data</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>newbrunswick</room><day>wed</day><track>paper</track><url></url><desc>1 of 2 (Personalization)</desc><wide>0</wide><name>Browsers and User Interfaces</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>newbrunswick</room><day>wed</day><track>paper</track><url></url><desc>2 of 2 (Smarter Browsing)</desc><wide>0</wide><name>Browsers and User 
Interfaces</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>alberta</room><day>wed</day><track>paper</track><url></url><desc>1 of 7 (Search Potpourri)</desc><wide>0</wide><name>Search</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>alberta</room><day>wed</day><track>paper</track><url></url><desc>2 of 7 (Crawlers)</desc><wide>0</wide><name>Search</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>alberta</room><day>wed</day><track>paper</track><url></url><desc>3 of 7 (Web Graphs)</desc><wide>0</wide><name>Search</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>cascade</room><day>wed</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#wednesday</url><desc>Making Mobile Web Browsing Better</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>am2</slot></slots></event><event><room>cascade</room><day>wed</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#wednesday</url><desc>Rich Web Applications</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>cascade</room><day>wed</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#wednesday</url><desc>The Future of the Web Page</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>pm2</slot></slots></event><event><room>vanhorne</room><day>thu</day><track>plenary</track><url>http://www2007.org/raghavan.php</url><desc>Web N.0: What sciences will it take? (Van Horne Ballroom)</desc><wide>1</wide><name>WWW2007 Announcements and Plenary Speaker: Prabhakar Raghavan (Yahoo! 
Research)</name><id></id><type></type><slots><slot>am1</slot></slots></event><event><room>vanhorne</room><day>thu</day><track>misc</track><url></url><desc>(offsite at Brewster's MountView Barbecue; bus transportation provided)</desc><wide>1</wide><name>WWW2007 Banquet (6:00pm to 9:00pm)</name><id></id><type></type><slots><slot>evening</slot></slots></event><event><room>shaughn</room><day>thu</day><track>paper</track><url></url><desc>1 of 2 (Scalable Systems for Dynamic Content)</desc><wide>0</wide><name>Performance and Scalability</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>beatty</room><day>thu</day><track>paper</track><url></url><desc>1 of 1 (Pervasive Web and Mobility)</desc><wide>0</wide><name>Pervasive Web and Mobility</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>newbrunswick</room><day>thu</day><track>paper</track><url></url><desc>1 of 1 (IPE)</desc><wide>0</wide><name>Industrial Practice and Experience</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>coleman</room><day>thu</day><track>devel</track><url>http://www2007.org/panel7.php</url><desc>Paul Miller (Talis)</desc><wide>0</wide><name>Semantic Web with Data</name><id></id><type>panel</type><slots><slot>am2</slot></slots></event><event><room>shaughn</room><day>thu</day><track>panel</track><url>http://www2007.org/panel3.php</url><desc>Arun Iyengar (IBM Research)</desc><wide>0</wide><name>Performance and Scalability</name><id></id><type>panel</type><slots><slot>pm1</slot></slots></event><event><room>shaughn</room><day>thu</day><track>paper</track><url></url><desc>2 of 2 (Performance Engineering of Web Applications)</desc><wide>0</wide><name>Performance and Scalability</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>beatty</room><day>thu</day><track>tutorial</track><url>http://www2007.org/tutorial-T10.php</url><desc>(Half-day 
tutorial)</desc><wide>0</wide><name>Model Driven Semantic Web Engineering</name><id>T10</id><type>tutorial</type><slots><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>coleman</room><day>thu</day><track>devel</track><url>http://www2007.org/prog-Developers.php#thursday</url><desc>Programming the Web</desc><wide>0</wide><name>Developers Track</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>coleman</room><day>thu</day><track>devel</track><url>http://www2007.org/prog-Developers.php#thursday</url><desc>Web Tools</desc><wide>0</wide><name>Developers Track</name><id></id><type></type><slots><slot>pm2</slot></slots></event><event><room>theatre</room><day>thu</day><track>sponsored</track><url>http://www2007.org/industrytopic7.php</url><desc>Web Search</desc><wide>0</wide><name>Industry Talks</name><id></id><type></type><slots><slot>am2</slot></slots></event><event><room>theatre</room><day>thu</day><track>sponsored</track><url>http://www2007.org/industrytopic2.php</url><desc>Mobile Web and Ajax</desc><wide>0</wide><name>Industry Panel</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>alberta</room><day>thu</day><track>paper</track><url></url><desc>1 of 5 (Identifying Structure in Web Pages)</desc><wide>0</wide><name>Data Mining</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>alberta</room><day>thu</day><track>paper</track><url></url><desc>2 of 5 (Mining Textual Data)</desc><wide>0</wide><name>Data Mining</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>cascade</room><day>thu</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#thursday</url><desc>Advances in Semantic Web</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>am2</slot></slots></event><event><room>cascade</room><day>thu</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#thursday</url><desc>Web of Services for 
Enterprise Computing</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>pm2</slot></slots></event><event><room>cascade</room><day>thu</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#thursday</url><desc>Security and Usability on the Web</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>vanhorne</room><day>fri</day><track>plenary</track><url>http://www2007.org/buxton.php</url><desc>Design for the World Narrow Web (Van Horne Ballroom)</desc><wide>1</wide><name>WWW2007 Announcements and Plenary Speaker: Bill Buxton (Microsoft Research/University of Toronto)</name><id></id><type></type><slots><slot>am1</slot></slots></event><event><room>shaughn</room><day>fri</day><track>paper</track><url></url><desc>1 of 2 (Web Modeling)</desc><wide>0</wide><name>Web Engineering</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>shaughn</room><day>fri</day><track>paper</track><url></url><desc>2 of 2 (End-User Perspective and Measurement in Web Engineering)</desc><wide>0</wide><name>Web Engineering</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>beatty</room><day>fri</day><track>paper</track><url></url><desc>1 of 2 (Orchestration and Choreography)</desc><wide>0</wide><name>Web Services</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>shaughn</room><day>fri</day><track>paper</track><url></url><desc>1 of 3 (Defending Against Emerging Threats)</desc><wide>0</wide><name>Security, Privacy, Reliability, and Ethics</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>shaughn</room><day>fri</day><track>paper</track><url></url><desc>(Defending Against Emerging Threats)(cont'd)</desc><wide>0</wide><name>Security, Privacy, Reliability, and 
Ethics</name><id></id><type>papers</type><slots><slot>pm3</slot></slots></event><event><room>beatty</room><day>fri</day><track>paper</track><url></url><desc>2 of 2 (SLAs and QoS)</desc><wide>0</wide><name>Web Services</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>beatty</room><day>fri</day><track>paper</track><url></url><desc>(SLAs and QoS) (cont'd)</desc><wide>0</wide><name>Web Services</name><id></id><type>papers</type><slots><slot>pm3</slot></slots></event><event><room>coleman</room><day>fri</day><track>devel</track><url>http://www2007.org/prog-Developers.php#friday</url><desc>Linked Data</desc><wide>0</wide><name>Developers Track</name><id></id><type></type><slots><slot>am2</slot></slots></event><event><room>coleman</room><day>fri</day><track>tutorial</track><url>http://www2007.org/tutorial-T11.php</url><desc>(Half-day tutorial)</desc><wide>0</wide><name>Semantic Web: Technologies and Applications for the Real-World</name><id>T11</id><type>tutorial</type><slots><slot>pm1</slot><slot>pm2</slot></slots></event><event><room>theatre</room><day>fri</day><track>sponsored</track><url>http://www2007.org/industrytopic3.php</url><desc>Web and Patents</desc><wide>0</wide><name>Industry Talks</name><id></id><type></type><slots><slot>am2</slot></slots></event><event><room>theatre</room><day>fri</day><track>devel</track><url>http://www2007.org/prog-Developers.php#friday</url><desc>Emerging Web Platforms</desc><wide>0</wide><name>Developers Track</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>theatre</room><day>fri</day><track>devel</track><url>http://www2007.org/prog-Developers.php#friday</url><desc>Next Generation Web Servers</desc><wide>0</wide><name>Developers Track</name><id></id><type></type><slots><slot>pm2</slot></slots></event><event><room>newbrunswick</room><day>fri</day><track>paper</track><url></url><desc>1 of 5 (Applications)</desc><wide>0</wide><name>Semantic 
Web</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>newbrunswick</room><day>fri</day><track>paper</track><url></url><desc>2 of 5 (Similarity and Extraction)</desc><wide>0</wide><name>Semantic Web</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>newbrunswick</room><day>fri</day><track>paper</track><url></url><desc>3 of 5 (Query Languages and DBs)</desc><wide>0</wide><name>Semantic Web</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>newbrunswick</room><day>fri</day><track>paper</track><url></url><desc>(Query Languages and DBs) (cont'd)</desc><wide>0</wide><name>Semantic Web</name><id></id><type>papers</type><slots><slot>pm3</slot></slots></event><event><room>alberta</room><day>thu</day><track>paper</track><url></url><desc>4 of 7 (Search Quality and Precision)</desc><wide>0</wide><name>Search</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>alberta</room><day>fri</day><track>paper</track><url></url><desc>5 of 7 (Advertisements and Click Estimates)</desc><wide>0</wide><name>Search</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>alberta</room><day>fri</day><track>paper</track><url></url><desc>6 of 7 (Knowledge Discovery)</desc><wide>0</wide><name>Search</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>alberta</room><day>sat</day><track>paper</track><url></url><desc>7 of 7 (Personalization)</desc><wide>0</wide><name>Search</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>alberta</room><day>fri</day><track>paper</track><url></url><desc>3 of 5 (Similarity Search)</desc><wide>0</wide><name>Data Mining</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>cascade</room><day>fri</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#friday</url><desc>A Multimodal Web to Expand 
Universal Access</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>am2</slot></slots></event><event><room>cascade</room><day>fri</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#friday</url><desc>Architectural Integration</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>cascade</room><day>fri</day><track>W3C</track><url>http://www2007.org/prog-W3CTrack.php#friday</url><desc>Query, Interchange, and Access with XML!</desc><wide>0</wide><name>W3C Track</name><id></id><type></type><slots><slot>pm2</slot></slots></event><event><room>vanhorne</room><day>sat</day><track>plenary</track><url>http://www2007.org/hardt.php</url><desc>An Identity Story (Van Horne Ballroom)</desc><wide>1</wide><name>WWW2007 Announcements and Plenary Speaker: Dick Hardt (SXIP Identity)</name><id></id><type></type><slots><slot>am1</slot></slots></event><event><room>vanhorne</room><day>sat</day><track>misc</track><url></url><desc>5:00pm to 5:30pm</desc><wide>1</wide><name>WWW2007 Closing Ceremony</name><id></id><type></type><slots><slot>evening</slot></slots></event><event><room>alberta</room><day>sat</day><track>paper</track><url></url><desc>4 of 5 (Predictive Modeling of Web Users)</desc><wide>0</wide><name>Data Mining</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>alberta</room><day>sat</day><track>paper</track><url></url><desc>5 of 5 (Mining in Social Networks)</desc><wide>0</wide><name>Data Mining</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>coleman</room><day>sat</day><track>devel</track><url>http://www2007.org/prog-Developers.php#saturday</url><desc>Web Data 1</desc><wide>0</wide><name>Developers Track</name><id></id><type></type><slots><slot>am2</slot></slots></event><event><room>coleman</room><day>sat</day><track>devel</track><url>http://www2007.org/prog-Developers.php#saturday</url><desc>Web Data 
2</desc><wide>0</wide><name>Developers Track</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>coleman</room><day>sat</day><track>devel</track><url></url><desc></desc><wide>0</wide><name>Cancelled</name><id></id><type></type><slots><slot>pm2</slot></slots></event><event><room>theatre</room><day>sat</day><track>sponsored</track><url>http://www2007.org/industrytopic4.php</url><desc>Web Enterprises</desc><wide>0</wide><name>Industry Talks</name><id></id><type></type><slots><slot>am2</slot></slots></event><event><room>theatre</room><day>sat</day><track>sponsored</track><url>http://www2007.org/industrytopic5.php</url><desc>Social Computing</desc><wide>0</wide><name>Industry Panel</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>theatre</room><day>sat</day><track>sponsored</track><url>http://www2007.org/industrytopic6.php</url><desc>New Web-based Business Models: Money or Meaning?</desc><wide>0</wide><name>Industry Panel</name><id></id><type></type><slots><slot>pm2</slot></slots></event><event><room>beatty</room><day>sat</day><track>paper</track><url></url><desc>1 of 2 (E-Communities)</desc><wide>0</wide><name>E* Applications</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>beatty</room><day>sat</day><track>paper</track><url></url><desc>2 of 2 (E-Commerce and E-Content)</desc><wide>0</wide><name>E* Applications</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>theatre</room><day>thu</day><track>panel</track><url>http://www2007.org/panel5.php</url><desc>Greg Conti (USMA)</desc><wide>0</wide><name>Web Search Privacy Issues</name><id></id><type>panel</type><slots><slot>pm2</slot></slots></event><event><room>newbrunswick</room><day>thu</day><track>panel</track><url>http://www2007.org/panel4.php</url><desc>Susan Boll (U. 
Oldenburg) and Raphael Troncy (CWI)</desc><wide>0</wide><name>Multimedia Metadata Standards in Semantic Web</name><id></id><type>panel</type><slots><slot>pm2</slot></slots></event><event><room>beatty</room><day>fri</day><track>panel</track><url>http://www2007.org/panel6.php</url><desc>Yoelle Marek (Google, Israel)</desc><wide>0</wide><name>Searching Personal Content</name><id></id><type>panel</type><slots><slot>am2</slot></slots></event><event><room>shaughn</room><day>sat</day><track>paper</track><url></url><desc>4 of 5 (Ontologies)</desc><wide>0</wide><name>Semantic Web</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>shaughn</room><day>sat</day><track>paper</track><url></url><desc>5 of 5 (Semantic Web and Web 2.0)</desc><wide>0</wide><name>Semantic Web</name><id></id><type>papers</type><slots><slot>pm1</slot></slots></event><event><room>newbrunswick</room><day>sat</day><track>paper</track><url></url><desc>2 of 3 (Passwords and Phishing)</desc><wide>0</wide><name>Security, Privacy, Reliability, and Ethics</name><id></id><type>papers</type><slots><slot>am2</slot></slots></event><event><room>shaughn</room><day>sat</day><track>paper</track><url></url><desc>3 of 3 (Access Control and Trust on the Web)</desc><wide>0</wide><name>Security, Privacy, Reliability, and Ethics</name><id></id><type>papers</type><slots><slot>pm2</slot></slots></event><event><room>newbrunswick</room><day>sat</day><track>bof</track><url>http://www2007.org/bof1.php</url><desc>The WWW and China</desc><wide>0</wide><name>Birds of a Feather (BOF) Session 1 of 2</name><id></id><type></type><slots><slot>pm1</slot></slots></event><event><room>newbrunswick</room><day>sat</day><track>bof</track><url>http://www2007.org/bof2.php</url><desc>Topic TBD</desc><wide>0</wide><name>Birds of a Feather (BOF) Session 2 of 
2</name><id></id><type></type><slots><slot>pm2</slot></slots></event><event><room>beatty</room><day>sat</day><track>misc</track><url></url><desc></desc><wide>0</wide><name>Cancelled</name><id></id><type></type><slots><slot>pm2</slot></slots></event> </events>
 <papers>
<paper><number>15</number><title>Extraction and Classification of Dense Communities in the Web</title><abstract>The World Wide Web (WWW) is rapidly becoming important for society as a medium for sharing data, information and services, and there is a growing interest in tools for understanding collective behaviors and emerging phenomena in the WWW.  In this paper we focus on the problem of searching and classifying communities in the web. Loosely speaking a community is a group of pages related to a common interest.  More formally communities have been associated in the computer science literature with the existence of a locally dense sub-graph of the web-graph (where web pages are nodes and hyper-links are arcs of the web-graph). The core of our contribution is a new scalable algorithm for finding relatively dense subgraphs in massive graphs.  We apply our algorithm on web-graphs built on three publicly available large crawls of the web (with raw sizes up to 120M nodes and 1G arcs). The effectiveness of our algorithm in finding dense subgraphs is demonstrated experimentally by embedding artificial communities in the web-graph and counting how many of these are blindly found. Effectiveness increases with the size and density of the communities: it is close to 100% for  communities of thirty nodes or more (even at low density). It is  still about 80% even for communities of twenty nodes with  density over 50% of the arcs present. At the lower extremes the algorithm catches 35% of dense communities made of ten nodes. 
We complete our Community Watch system by clustering the communities found in the web-graph into homogeneous groups by topic and labelling each group by representative keywords.</abstract><authors><person><name>Yon Dourisboure</name><org>Institute for Informatics and Telematics of C.N.R.</org></person><person><name>Filippo Geraci</name><org>Institute for Informatics and Telematics of C.N.R.</org></person><person><name>Marco Pellegrini</name><org>Institute for Informatics and Telematics of C.N.R.</org></person></authors></paper><paper><number>56</number><title>Robust Methodologies for Modeling Web Click Distributions</title><abstract>Metrics such as click counts are vital to online businesses but their measurement has been problematic due to inclusion of high variance robot traffic. We posit that by applying statistical methods more rigorous than have been employed to date that we can build a robust model of the distribution of clicks following which we can set probabilistically sound thresholds to address outliers and robots. Prior research in this domain has used inappropriate statistical methodology to model distributions and current industrial practice eschews this research for conservative ad-hoc click-level thresholds. Prevailing belief is that such distributions are scale-free power law distributions but using more rigorous statistical methods we find the best description of the data is instead provided by a scale-sensitive Zipf-Mandelbrot mixture distribution. Our results are based on ten datasets from various verticals in the Yahoo domain. Since mixture models can overfit the data we take care to use the BIC log-likelihood method which penalizes overly complex models. Using a mixture model in the web activity domain makes sense because there are likely multiple classes of users. In particular, we have noticed that there is a significantly large set of ``users'' that visit the Yahoo portal exactly once a day. 
We surmise these may be robots testing internet connectivity by pinging the Yahoo main website.&lt;br /&gt;&lt;br /&gt; Backing up our quantitative analysis is graphical analysis in which empirical distributions are plotted against theoretical distributions in log-log space using robust cumulative distribution plots. This methodology has two advantages: plotting in log-log space allows one to visually differentiate the various exponential distributions and secondly, cumulative plots are much more robust to outliers. We plan to use the results of this work for applications for robot removal from web metrics business intelligence systems.</abstract><authors><person><name>Kamal Ali</name><org>Yahoo</org></person><person><name>Mark Scarr</name><org>Yahoo</org></person></authors></paper><paper><number>57</number><title>DIANE - An Integrated Approach to Automated Service Discovery, Matchmaking and Composition</title><abstract>Automated matching of semantic service descriptions is the key to automatic service discovery and binding. But when trying to find a match for a certain request it may often happen, that the request cannot be serviced by a single offer but could be handled by combining existing offers. In this case automatic service composition is needed. Although automatic composition is an active field of research it is mainly viewed as a planning problem and treated separately from service discovery. In this paper we argue that an integrated approach to the problem is better than separating these issues as is usually done. 
We propose an approach that integrates service composition into service discovery and matchmaking to match service requests that ask for multiple connected effects, discuss general issues involved in describing and matching such services and present an efficient algorithm implementing our ideas.</abstract><authors><person><name>Ulrich Kuester</name><org>Friedrich Schiller University Jena</org></person><person><name>Birgitta Koenig-Ries</name><org>Friedrich Schiller University Jena</org></person><person><name>Mirco Stern</name><org>Universitaet Karlsruhe</org></person><person><name>Michael Klein</name><org>Universitaet Karlsruhe</org></person></authors></paper><paper><number>58</number><title>Answering Relationship Queries on the Web</title><abstract>Finding relationships between entities on the Web, e.g., the connections between different places or the commonalities of people, is a novel and challenging problem. Existing Web search engines excel in keyword matching and document ranking, but they cannot well handle many relationship queries. This paper proposes a new method for answering relationship queries on two entities. Our method first respectively retrieves the top Web pages for either entity from a Web search engine. It then matches these Web pages and generates an ordered list of Web page pairs. Each Web page pair consists of one Web page for either entity. The top ranked Web page pairs are likely to contain the relationships between the two entities. One main challenge in the ranking process is to effectively filter out the large amount of noise in the Web pages without losing much useful information. To achieve this, our method assigns appropriate weights to terms in Web pages and intelligently identifies the potential connecting terms that capture the relationships between the two entities. Only those top potential connecting terms with large weights are used to rank Web page pairs. Finally, the top ranked Web page pairs are presented to the searcher. 
For each such pair, the query terms and the top potential connecting terms are properly highlighted so that the relationships between the two entities can be easily identified. We implemented a prototype on top of the Google search engine and evaluated it under a wide variety of query scenarios. The experimental results show that our method is effective at finding important relationships with low overhead.</abstract><authors><person><name>Gang Luo</name><org>IBM T.J. Watson Research Center</org></person><person><name>Chunqiang Tang</name><org>IBM T.J. Watson Research Center</org></person><person><name>Ying-li Tian</name><org>IBM T.J. Watson Research Center</org></person></authors></paper><paper><number>67</number><title>Bridging the Gap Between OWL and Relational Databases</title><abstract>Schema statements in OWL are interpreted in a different way from similar statements in a relational database setting. This can lead to problems in data-centric applications, where OWL's interpretation of the statements intended as constraints may be confusing and/or inappropriate. We propose an extension of OWL that attempts to mimic the intuition behind integrity constraints in relational databases. We discuss the algorithms for checking constraint satisfaction for different types of knowledge bases, and show that, provided the constraints are satisfied, we can disregard them while answering a broad range of positive queries.</abstract><authors><person><name>Boris Motik</name><org>University of Manchester</org></person><person><name>Ian Horrocks</name><org>University of Manchester</org></person><person><name>Ulrike Sattler</name><org>University of Manchester</org></person></authors></paper><paper><number>70</number><title>DETECTIVES: DETEcting Coalition hiT Inflation attacks in adVertising nEtworks Streams</title><abstract>Click fraud is jeopardizing the industry of Internet advertising. 
Internet advertising is crucial for the thriving of the entire Internet, since it allows producers to advertise their products, and hence contributes to the well being of e-commerce. Moreover, advertising supports the intellectual value of the Internet by covering the running expenses of the content publishers' sites. Some publishers are dishonest, and use automation to generate traffic to defraud the advertisers. Similarly, some advertisers automate clicks on the advertisements of their competitors to deplete their competitors' advertising budgets. This paper describes the advertising network model, and focuses on the most sophisticated type of fraud, which involves coalitions among fraudsters. We build on several published theoretical results to devise the Similarity-Seeker algorithm that discovers coalitions made by pairs of fraudsters. We then generalize the solution to coalitions of arbitrary sizes. Before deploying our system on a real network, we conducted comprehensive experiments on data samples for proof of concept. We detected numerous coalitions that span numerous sites. Interestingly, 93% of the discovered sites were real fraudsters.</abstract><authors><person><name>Ahmed Metwally</name><org>University of California, Santa Barbara</org></person><person><name>Divyakant Agrawal</name><org>University of California, Santa Barbara</org></person><person><name>Amr El Abbadi</name><org>University of California, Santa Barbara</org></person></authors></paper><paper><number>89</number><title>Dynamics of Bid Optimization in Online Advertisement Auctions</title><abstract>We consider the problem of online keyword advertising auctions among multiple bidders with limited budgets, and study a natural bidding heuristic in which advertisers attempt to optimize their utility by equalizing their return-on-investment across all keywords.   
We show that existing auction mechanisms combined with this heuristic can experience cycling (as has been observed in many current systems), and therefore propose a modified class of mechanisms with small random perturbations. This perturbation is reminiscent of the small time-dependent perturbations employed in the dynamical systems literature to convert many types of chaos into attracting motions. We show that the perturbed mechanism provably converges in the case of first-price auctions and experimentally converges in the case of second-price auctions.  Moreover, the point of convergence has a natural economic interpretation as the unique market equilibrium in the case of first-price mechanisms.  In the case of second-price auctions, we conjecture that it converges to the ``supply-aware'' market equilibrium. Thus, our results can be alternatively described as a tatonnement process for convergence to market equilibrium in which prices are adjusted on the side of the buyers rather than the sellers. We also observe that perturbation in mechanism design is useful in a broader context: In general, it can allow bidders to ``share'' a particular item, leading to stable allocations and pricing for the bidders, and improved revenue for the auctioneer.</abstract><authors><person><name>Christian Borgs</name><org>Microsoft Research</org></person><person><name>Jennifer Chayes</name><org>Microsoft Research</org></person><person><name>Omid Etesami</name><org>U.C. Berkeley</org></person><person><name>Nicole Immorlica</name><org>Microsoft Research</org></person><person><name>Kamal Jain</name><org>Microsoft Research</org></person><person><name>Mohammad Mahdian</name><org>Yahoo! Research</org></person></authors></paper><paper><number>91</number><title>A New Suffix Tree Similarity Measure for Document Clustering</title><abstract>In this paper, we propose a new similarity measure to compute the pairwise similarity of text-based documents based on suffix tree document model. 
By applying the new suffix tree similarity measure in Group-average Agglomerative Hierarchical Clustering (GAHC) algorithm, we developed a new suffix tree document clustering algorithm (NSTC). Our experimental results on two standard document clustering benchmark corpus OHSUMED and RCV1 indicate that the new clustering algorithm is a very effective document clustering algorithm. Comparing with the results of traditional keyword tfidf similarity measure in the same GAHC algorithm, NSTC achieved an improvement of 51% on the average of F-measure score. Furthermore, we apply the new clustering algorithm in analyzing the Web documents in online forum communities. A topic oriented clustering algorithm is developed to help people in assessing, classifying and searching the Web documents in a large forum community.</abstract><authors><person><name>Hung Chim</name><org>City University of Hong Kong</org></person><person><name>Xiaotie Deng</name><org>City University of Hong Kong</org></person></authors></paper><paper><number>100</number><title>Extraction and Search of Chemical Formulae in Text Documents on the Web</title><abstract>Often scientists seek to search for articles on the Web related to a particular chemical. When a scientist searches for a chemical formula using a search engine today, she gets back articles where the exact keyword string expressing the chemical formula is found.  Searching for the exact occurrence of keywords while searching results in  two problems for this domain: a) if the author searches for CH4 and the article has H4C, the article is not returned, and b) ambiguous searches like &quot;He&quot; return all documents where Helium is mentioned as well as documents where the pronoun &quot;he&quot; occurs. To remedy these deficiencies, we propose a chemical formula search engine. 
To build a chemical formula search engine, we must solve the following problems: (1) extract chemical formulae from text documents, (2) index chemical formulae, and (3) design a ranking function for articles where the chemical formulae occur. Furthermore, query models are introduced for formula search, and for each a scoring scheme based on features of partial formulae is proposed to measure the relevance of chemical formulae and queries. We evaluate algorithms for identifying chemical formulae in documents using a classification method based on Support Vector Machines (SVM), and a probabilistic model based on conditional random fields (CRF). Different methods for SVM and CRF to tune the trade-off between recall and precision for imbalanced data are proposed to improve the over-all performance. A feature selection method based on frequency and discrimination is used to remove uninformative and redundant features. Experiments show that our approaches of chemical formula extraction work well, especially after trade-off tuning. The results also demonstrate that feature selection can reduce the index size without changing the ranked query results much.</abstract><authors><person><name>Bingjun Sun</name><org>Pennsylvania State University</org></person><person><name>Qingzhao Tan</name><org>Pennsylvania State University</org></person><person><name>Prasenjit Mitra</name><org>Pennsylvania State University</org></person><person><name>C. Lee Giles</name><org>Pennsylvania State University</org></person></authors></paper><paper><number>111</number><title>Spam Double-Funnel: Connecting Web Spammers with Advertisers</title><abstract>Spammers use questionable search engine optimization (SEO) techniques to promote their spam links into top search results. In this paper, we focus on one prevalent type of spam - redirection spam - where one can identify spam pages by the third-party domains that these pages redirect traffic to. 
We propose a five-layer, double-funnel model for describing end-to-end redirection spam, present a methodology for analyzing the layers, and identify prominent domains on each layer using two sets of commercial keywords - one targeting spammers and the other targeting advertisers. The methodology and findings are useful for search engines to strengthen their ranking algorithms against spam, for legitimate website owners to locate and remove spam doorway pages, and for legitimate advertisers to identify unscrupulous syndicators who serve ads on spam pages.</abstract><authors><person><name>Yi-Min Wang</name><org>Microsoft Research</org></person><person><name>Ming Ma</name><org>Microsoft Research</org></person><person><name>Yuan Niu</name><org>UC Davis</org></person><person><name>Hao Chen</name><org>UC Davis</org></person></authors></paper><paper><number>120</number><title>Consistency-preserving Caching of Dynamic Database Content</title><abstract>With the growing use of dynamic web content generated from relational databases, traditional caching solutions for throughput and latency improvements are ineffective.  We describe a middleware layer called Ganesh that reduces the volume of data transmitted without semantic interpretation of queries or results.  It achieves this reduction through the use of cryptographic hashing to detect similarities with previous results.  These benefits do not require any compromise of the strict consistency semantics provided by the back-end database. Further, Ganesh does not require modifications to applications, web servers, or database servers, and works with closed-source applications and databases.  
Using two benchmarks representative of dynamic web sites, measurements of our prototype show that it can increase end-to-end throughput by as much as twofold for non-data intensive applications and by as much as tenfold for data intensive ones.</abstract><authors><person><name>Niraj Tolia</name><org>Carnegie Mellon University</org></person><person><name>M. Satyanarayanan</name><org>Carnegie Mellon University</org></person></authors></paper><paper><number>127</number><title>Connecting the Bottom of the Pyramid: An Exploratory Case Study of India's Rural Communication Environment</title><abstract>This paper is based on our exploratory study of a South Indian village in Chamrajanagar district of Karnataka. The study was to understand the rural communication environment and villagers' communication preferences. We examined people's lifestyle, working conditions and their communication eco-system. Our study revealed that villagers, unlike urban inhabitants, interacted with people outside the village only for specific, rather than casual purposes. Another interesting aspect of rural communication was the marginal use of the postal system and the ubiquitous use of pay phone, apart from word of mouth and face-to-face interactions. In fact, personal (face-to-face) interaction was usually preferred among villages in this region, over other kinds of communication, despite infrastructural constraints like poor transport services.&lt;br /&gt;&lt;br /&gt; We also observed that communication frequency increased when status quo changed to one that required immediate attention. During the analysis we identified certain social, economic and cultural communication gaps (or problems). However, these problems were clear opportunities to connect the unconnected rural users, by deploying new communication systems and features. 
Here, we have highlighted some of our findings and possible design avenues based on these findings.</abstract><authors><person><name>Sarita Seshagiri</name><org>Motorola India Research Labs</org></person><person><name>Aman Sagar</name><org>Motorola India Research Labs</org></person><person><name>Dhaval Joshi</name><org>Motorola India Research Labs</org></person></authors></paper><paper><number>140</number><title>Compiling Cryptographic Protocols for Deployment on the Web</title><abstract>Cryptographic protocols are useful for trust engineering in Web transactions.  The Cryptographic Protocol Programming Language (CPPL) provides a model wherein trust management annotations are attached to protocol actions, and are used to constrain the behavior of a protocol participant to be compatible with its own trust policy.&lt;br /&gt;&lt;br /&gt; The first implementation of CPPL generated stand-alone, single-session servers, making it unsuitable for deploying protocols on the Web. We describe a new compiler that uses a constraint-based analysis to produce multi-session server programs.  The resulting programs run without persistent TCP connections for deployment on traditional Web servers.  Most importantly, the compiler preserves existing proofs about the protocols.  We present an enhanced version of the CPPL language, discuss the generation and use of constraints, show their use in the compiler, formalize the preservation of properties, present subtleties, and outline implementation details.</abstract><authors><person><name>Jay McCarthy</name><org>Brown University</org></person><person><name>Joshua D. Guttman</name><org>MITRE Corporation</org></person><person><name>John D. 
Ramsdell</name><org>MITRE Corporation</org></person><person><name>Shriram Krishnamurthi</name><org>Brown University</org></person></authors></paper><paper><number>145</number><title>A Scalable Application Placement Controller for Enterprise Data Centers</title><abstract>Given a set of machines and a set of Web applications with dynamically changing demands, an application placement controller decides how many instances to run for each application and where to put them, while observing all kinds of resource constraints. This problem is NP hard. In this paper, we propose an online algorithm that uses heuristics to efficiently solve this problem. It allows multiple applications to share a single machine, and strives to maximize the total satisfied application demand, to minimize the number of application starts and stops, and to balance the load across machines. It can produce within 30 seconds high-quality solutions for hard placement problems with thousands of machines and thousands of applications. This scalability is crucial for dynamic resource provisioning in large-scale enterprise data centers. Our algorithm significantly and consistently outperforms the existing state-of-the-art algorithm under a wide variety of workloads.</abstract><authors><person><name>Chunqiang Tang</name><org>IBM T.J. Watson Research Center</org></person><person><name>Malgorzata Steinder</name><org>IBM Research</org></person><person><name>Michael Spreitzer</name><org>IBM Research</org></person><person><name>Giovanni Pacifici</name><org>IBM Research</org></person></authors></paper><paper><number>158</number><title>Is High-Quality VoD Feasible using P2P Swarming?</title><abstract>Digital media companies have recently started embracing P2P networks as an alternative content distribution channel. However, the drawback of the current P2P swarming systems is that users need to download the full video and, hence, wait a long time before they can start watching it. 
While a lot of effort has gone into optimizing the distribution of large files, little research has been done on how to enable high-quality Video-on-Demand (VoD) functionality with P2P swarming systems. The main challenges reside in ensuring that users can start watching a movie at any point in time, while providing small start-up times, sustainable playback rates and high swarming efficiencies.&lt;br /&gt;&lt;br /&gt; In this work, we explore the feasibility of providing high-quality VoD using P2P mesh-based networks. To this extent, we investigate scheduling and pre-fetching techniques, network coding, and mesh topology management. Using both simulations and results from a real implementation, we provide evidence that high-quality VoD is feasible, and give guidelines to enable play-as-you-download P2P swarming systems with high playback rates and minimum start-up delays.</abstract><authors><person><name>Siddhartha Annapureddy</name><org>New York University</org></person><person><name>Saikat Guha</name><org>Cornell University</org></person><person><name>Dinan Gunawardena</name><org>Microsoft Research</org></person><person><name>Christos Gkantsidis</name><org>Microsoft Research</org></person><person><name>Pablo Rodriguez</name><org>Telefonica Research</org></person></authors></paper><paper><number>161</number><title>Exhibit: Light-weight Structured Data Publishing</title><abstract>It is no surprise that Semantic Web researchers and enthusiasts are excited to publish and accumulate semi-structured data on the Web. But looking beyond our community, we recognize that many, many other people also have structured data and want to publish it in rich browsing interfaces. These small-time authors fall into the same category as those early enthusiasts of the Web who were simply excited by the opportunity of using the new medium to share information that they cared about. 
With this insight, we create a lightweight structured data publishing framework called Exhibit that duplicates many factors we believe have contributed to the original growth of the Web. We argue that appealing to this segment of the Web population--addressing their publishing needs and desires at very low cost in many aspects--lets us leverage their labor to structure-ize existing content on the Web that has previously been authored in HTML by hand and is remaining hard to harvest automatically.</abstract><authors><person><name>David Huynh</name><org>MIT CSAIL</org></person><person><name>David Karger</name><org>MIT CSAIL</org></person><person><name>Rob Miller</name><org>MIT</org></person></authors></paper><paper><number>162</number><title>Navigation-Aided Retrieval</title><abstract>Users searching for information in hypermedia environments often perform querying followed by manual navigation. Yet, the conventional text/hypertext retrieval paradigm does not explicitly take post-query navigation into account. This paper proposes a new retrieval paradigm, called navigation-aided retrieval (NAR), which treats both querying and navigation as first-class activities. In the NAR paradigm, querying is seen as a means to identify starting points for navigation, and navigation is guided based on information supplied in the query. NAR is a generalization of the conventional probabilistic information retrieval paradigm, which implicitly assumes no navigation takes place.&lt;br /&gt;&lt;br /&gt; This paper presents a formal model for navigation-aided retrieval, and reports empirical results that point to the real-world applicability of the model. The experiments were performed over a large Web corpus provided by TREC, using human judgments on a new rating scale developed for navigation-aided retrieval. In the case of ambiguous queries, the new retrieval model identifies good starting points for post-query navigation. 
For less ambiguous queries that need not be paired with navigation, the output closely matches that of a conventional retrieval system.</abstract><authors><person><name>Shashank Pandit</name><org>Carnegie Mellon University</org></person><person><name>Christopher Olston</name><org>Yahoo! Research</org></person></authors></paper><paper><number>190</number><title>Turning Portlets into Services: The Consumer Profile</title><abstract>Portlets strive to play at the front end the same role that Web services currently enjoy at the back end, namely, enablers of application assembly through reusable services. However, it is well-known in the component community that, the larger the component, the more reduced the reuse. Hence, the coarse-grained nature of portlets (they encapsulate also the presentation layer) can jeopardize this vision of portlets as reusable services. To avoid this situation, this work proposes a perspective shift in portlet development by introducing the notion of organization profile. While the user profile characterises the end user (e.g. age, name, etc), the organization profile captures the idiosyncrasies of the organization through which the portlet is being delivered (e.g. the portal owner) as far as the portlet functionality is concerned. The user profile is dynamic and hence, requires the portlet to be customised at run time. By contrast, the organization profile is known at registration time, and it is not always appropriate/possible to consider it at run time. Rather, it is better to customize the code at development time, and produce an organization-specific portlet with built-in, custom functionality. In this scenario, we no longer have a portlet but a family of portlets, and the portlet provider becomes the &quot;assembly line&quot; of this family. 
This work promotes this vision by introducing an organization-aware, WSRP-compliant architecture that lets portlet consumers register and handle &quot;family portlets&quot; in the same way as &quot;traditional portlets&quot;. In so doing, portlets come closer to becoming truly reusable services.</abstract><authors><person><name>Oscar Diaz</name><org>University of the Basque Country</org></person><person><name>Salvador Trujillo</name><org>University of the Basque Country</org></person><person><name>Sandy Perez</name><org>University of the Basque Country</org></person></authors></paper><paper><number>194</number><title>Do Not Crawl in the DUST: Different URLs with Similar Text</title><abstract>We consider the problem of DUST: Different URLs with Similar Text. Such duplicate URLs are prevalent in web sites, as web server software often uses aliases and redirections, and dynamically generates the same page from various different URL requests. We present a novel algorithm, DustBuster, for uncovering DUST; that is, for discovering rules that transform a given URL to others that are likely to have similar content. DustBuster mines DUST effectively from previous crawl logs or web server logs, without examining page contents. Verifying these rules via sampling requires fetching few actual web pages. 
Search engines can benefit from information about DUST to increase the effectiveness of crawling, reduce indexing overhead, and improve the quality of popularity statistics such as PageRank.</abstract><authors><person><name>Ziv Bar-Yossef</name><org>Technion - Israel Institute of Technology and Google Haifa</org></person><person><name>Idit Keidar</name><org>Technion - Israel Institute of Technology</org></person><person><name>Uri Schonfeld</name><org>University of California Los Angeles</org></person></authors></paper><paper><number>200</number><title>Towards the Theoretical Foundation of Choreography</title><abstract>With the growth of interest in web services, people pay more and more attention to choreography, that is, to describe collaborations of participants from a global viewpoint, in accomplishing a common business goal. In this paper, based on a simple choreography language and a role-oriented process language, we study some fundamental issues related to choreography, especially related to implementation, including semantics, projection and natural projection, dominant role in choices and iterations, etc. We develop the concept of dominant role and propose some novel language structures related to it. The study reveals some clues about the language, semantics, specification and implementation of choreography.</abstract><authors><person><name>Zongyan Qiu</name><org>Peking University</org></person><person><name>Xiangpeng Zhao</name><org>Peking University</org></person><person><name>Chao Cai</name><org>Peking University</org></person><person><name>Hongli Yang</name><org>Peking University</org></person></authors></paper><paper><number>207</number><title>NetProbe: A Fast and Scalable System for Fraud Detection in Online Auction Networks</title><abstract>Given a large online network of online auction users and their histories of transactions, how can we spot anomalies, or even auction fraud? 
We describe the algorithms and system design decisions behind our proposed NetProbe system for uncovering auction fraud. We show that it is possible to do fast and scalable fraud detection, in large auction networks. The main idea is to use the machinery of &quot;Markov Random Fields&quot; (MRF), and try to guess the hidden state (fraud/honest) of each participant. We describe the algorithms behind our system, that are based on &quot;belief propagation&quot;; we provide our own incremental but accurate approximations to it; and we list and justify our design decisions for efficient crawling of real auction networks. We report experiments on synthetic graphs containing as many as 7,000 nodes and 30,000 edges, where NetProbe was able to spot fraudulent nodes with over 90 % precision and recall, with execution times in the order of seconds. We also report experiments on a real graph consisting of about 700,000 transactions between more than 66,000 eBay users, where NetProbe was highly effective at unearthing hidden networks of fraudsters, within a realistic response time of about 6 minutes.</abstract><authors><person><name>Shashank Pandit</name><org>Carnegie Mellon University</org></person><person><name>Duen Horng Chau</name><org>Carnegie Mellon University</org></person><person><name>Samuel Wang</name><org>Carnegie Mellon University</org></person><person><name>Christos Faloutsos</name><org>Carnegie Mellon University</org></person></authors></paper><paper><number>215</number><title>Detecting Near-Duplicates for Web Crawling</title><abstract>Near-duplicate documents are  commonly found on the web.   A pair of near-duplicate  web pages  differ from  each other  in a  very small portion.   The differences  commonly consist  of  advertisements and timestamps. Such differences are  irrelevant for web search.  
During web  crawling, it  is useful  to quickly  ascertain whether  a newly crawled web  page is  a near-duplicate of  a previously  crawled web page or not.&lt;br /&gt;&lt;br /&gt; In the  course of developing  a practical system  for near-duplicate detection,   we  make   two  research   contributions.    First,  we demonstrate the effectiveness of Charikar's fingerprinting technique for  identifying  near-duplicate web  pages.   We  show  that for  8 billion   web-pages,  a   good  choice   of  parameters   is  64-bit fingerprints  and 3-bit  Hamming distances.   Second, we  present an algorithmic technique for  identifying existing f-bit fingerprints that differ from  a given fingerprint in at  most k bit-positions, for  small k.   Our technique  is useful  for both  online queries (single  fingerprints) and  batch  queries (multiple  fingerprints). Experimental evaluation over real  data confirms the practicality of our design.</abstract><authors><person><name>Gurmeet Manku</name><org>Google Inc.</org></person><person><name>Arvind Jain</name><org>Google Inc.</org></person><person><name>Anish Das Sarma</name><org>Stanford University</org></person></authors></paper><paper><number>216</number><title>Optimized Query Planning of Continuous Aggregation Queries in Dynamic Data Dissemination Networks</title><abstract>Continuous queries are used to monitor changes to time varying data and to provide results useful for online decision making. Typically a user desires to obtain the value of some aggregation function over distributed data items, for example, to know (a) the average of temperatures sensed by a set of sensors (b) the value of index of mid-cap stocks. In these queries a client specifies a coherency or accuracy requirement as part of the query. In this paper we present a low-cost, scalable technique to answer continuous aggregation queries using a content distribution network of dynamic data items. 
In such a network of data aggregators, each data aggregator serves a set of data items at specific coherencies. Just as various fragments of a dynamic web-page are served by one or more nodes of a CDN, our technique involves decomposing a client query into sub-queries and executing sub-queries on judiciously chosen data aggregators. For executing an incoherency bounded continuous query, a query plan is required which includes the set of sub-queries, their individual incoherency bounds and data aggregators which can execute these sub-queries. An optimal query execution plan should satisfy client query's coherency requirement with least cost, measured in terms of the number of refresh messages sent from aggregators to the client. For estimating query execution cost, we build a continuous query cost model which can be used to estimate the number of messages required to satisfy the client specified incoherency bound. Performance results using real-world traces show that our cost based query planning leads to queries being executed using less than one third the number of messages required by existing schemes.</abstract><authors><person><name>Rajeev Gupta</name><org>IBM India Research Lab</org></person><person><name>Krithi Ramamritham</name><org>IIT Bombay</org></person></authors></paper><paper><number>223</number><title>PRIVE: Anonymous Location-Based Queries in Distributed Mobile Systems</title><abstract>Nowadays, mobile users with positioning devices can access Location Based Services (LBS) and query about points of interest in their proximity. For such applications to succeed, privacy and confidentiality are essential. Encryption alone is not adequate; although it safeguards the system against eavesdroppers, the queries themselves may disclose the location and identity of the user. Recently, there have been proposed centralized architectures based on  k-Anonymity, which utilize an intermediate anonymizer between the mobile users and the LBS. 
However, the anonymizer must be updated continuously with the current locations of all users. Moreover, the complete knowledge of the entire system poses a security threat, if the anonymizer is compromised.&lt;br /&gt;&lt;br /&gt; In this paper we address two issues: (i) We show that existing approaches may fail to provide spatial anonymity for some distributions of user locations and describe a novel technique which solves this problem. (ii) We propose PRIVE, a decentralized architecture for preserving the anonymity of users issuing spatial queries to LBSs. Mobile users self-organize into an overlay network with good fault tolerance and load balancing properties. PRIVE avoids the bottleneck caused by centralized techniques both in terms of anonymization and location updates. Moreover, the status is distributed in numerous users, rendering the system resilient to attacks. Extensive experimental studies suggest that PRIVE is applicable to real-life scenarios with large populations of mobile users.</abstract><authors><person><name>Gabriel Ghinita</name><org>National University of Singapore</org></person><person><name>Panos Kalnis</name><org>National University of Singapore</org></person><person><name>Spiros Skiadopoulos</name><org>University of Peloponnese</org></person></authors></paper><paper><number>225</number><title>Exploring in the Weblog Space by Detecting Informative and Affective Articles</title><abstract>Weblogs have become a prevalent source of information for people to express themselves. In general, there are two genres of contents in weblogs.  The first kind is about the webloggers' personal feelings, thoughts or emotions.  We call this kind of weblogs affective articles. A second kind of weblogs is about technologies and different kinds of informative news. In this paper, we present a machine learning method for classifying informative and affective articles among weblogs. We consider this problem as a binary classification problem. 
By using machine learning approaches, we achieve 92% on information retrieval performance measures including precision, recall and F1. We set up three studies on the applications of above classification approach in both research and industrial fields. We use the above classification approach to improve the performance of classification of emotions from weblog articles.  We also develop an intent-driven weblog-search engine based on the classification techniques to improve the satisfaction of web users. Finally, we use above classification approach to search for weblogs with a great deal of informative articles.</abstract><authors><person><name>Xiaochuan Ni</name><org>Department of Computer Science and Engineering Shanghai Jiao-Tong University</org></person><person><name>Gui-Rong Xue</name><org>Shanghai Jiao-Tong University</org></person><person><name>Xiao Ling</name><org>Department of Computer Science and Engineering Shanghai Jiao-Tong University</org></person><person><name>Yong Yu</name><org>Shanghai Jiao-Tong University</org></person><person><name>Qiang Yang</name><org>Hong Kong University of Science and Technology</org></person></authors></paper><paper><number>232</number><title>Wherefore Art Thou R3579X?  Anonymized Social Networks, Hidden Patterns, and Structural Steganography</title><abstract>In a social network, nodes correspond to people or other social entities, and edges correspond to social links between them.  In an effort to preserve privacy, the practice of anonymization replaces names with meaningless unique identifiers.  
We describe a family of schemes such that even from a single anonymized copy of a social network, it is possible for an adversary to learn whether edges exist or not between specific targeted pairs of nodes.</abstract><authors><person><name>Lars Backstrom</name><org>Cornell University</org></person><person><name>Cynthia Dwork</name><org>Microsoft Research</org></person><person><name>Jon Kleinberg</name><org>Cornell University</org></person></authors></paper><paper><number>247</number><title>Privacy-Enhancing Personalized Web Search</title><abstract>Personalized web search is a promising way to improve search quality by customizing search results for people with individual information goals. However, users are uncomfortable with exposing private preference information to search engines. On the other hand, privacy is not absolute, and often can be compromised if there is a gain in service or profitability to the user. Thus, a balance must be struck between search quality and privacy protection. This paper presents a scalable way for users to automatically build rich user profiles. These profiles summarize a user's interests into a hierarchical organization according to specific interests. Two parameters for specifying privacy requirements are proposed to help the user to choose the content and degree of detail of the profile information that is exposed to the search engine. Experiments showed that the user profile improved search quality when compared to standard MSN rankings. 
More importantly, results verified our hypothesis that a significant improvement on search quality can be achieved by only sharing some higher-level user profile information, which is potentially less sensitive than detailed personal information.</abstract><authors><person><name>Yabo Xu</name><org>Simon Fraser University</org></person><person><name>Benyu Zhang</name><org>Microsoft Research Asia</org></person><person><name>Zheng Chen</name><org>Microsoft Research Asia</org></person><person><name>Ke Wang</name><org>Simon Fraser University</org></person></authors></paper><paper><number>272</number><title>ActiveRDF: Object-Oriented Semantic Web Programming</title><abstract>Object-oriented programming is the current mainstream programming paradigm but existing RDF APIs are mostly triple-oriented. Traditional techniques for bridging a similar gap between relational databases and object-oriented programs cannot be applied directly, given the different nature of Semantic Web data, as can for example   be seen in the semantics of class membership, inheritance relations, and object conformance to schemas.&lt;br /&gt;&lt;br /&gt; We present ActiveRDF, an object-oriented API for managing RDF data that offers full manipulation and querying of RDF data, does not rely on a schema and fully conforms to RDF(S) semantics. ActiveRDF can be used with different RDF data stores, adapters have been implemented to generic SPARQL endpoints, Sesame, Jena, Redland and YARS and new adapters can be added easily.  
In addition, integration with the popular Ruby on Rails framework enables fast development of Semantic Web applications.</abstract><authors><person><name>Eyal Oren</name><org>DERI, National University of Ireland, Galway</org></person><person><name>Renaud Delbru</name><org>DERI NUIG</org></person><person><name>Sebastian Gerke</name><org>DERI NUIG</org></person><person><name>Armin Haller</name><org>DERI NUIG</org></person><person><name>Stefan Decker</name><org>DERI NUIG</org></person></authors></paper><paper><number>279</number><title>XML Design for Relational Storage</title><abstract>Design principles for XML schemas that eliminate redundancies and avoid update anomalies have been studied recently. Several normal forms, generalizing those for relational databases, have been proposed. All of them, however, are based on the assumption of a native XML storage, while in practice most of XML data is stored in relational databases.&lt;br /&gt;&lt;br /&gt; In this paper we study XML design and normalization for relational storage of XML documents. To be able to relate and compare XML and relational designs, we use an information-theoretic framework that measures information content in relations and documents, with higher values corresponding to lower levels of redundancy. We show that most common relational storage schemes preserve the notion of being well-designed (i.e., anomalies- and redundancy-free). Thus, existing XML normal forms guarantee well-designed relational storages as well. We further show that if this perfect option is not achievable, then a slight restriction on XML constraints guarantees a ``second-best'' relational design, according to possible values of the information-theoretic measure. 
We finally consider an edge-based relational representation of XML documents, and show that while it has similar information-theoretic properties with other relational representations, it can behave significantly worse in terms of enforcing integrity constraints.</abstract><authors><person><name>Solmaz Kolahi</name><org>University of Toronto</org></person><person><name>Leonid Libkin</name><org>University of Edinburgh</org></person></authors></paper><paper><number>286</number><title>Supervised Rank Aggregation</title><abstract>This paper is concerned with rank aggregation, the task of combining results of individual ranking functions in meta-search. Previously, rank aggregation was performed mainly by using unsupervised methods. It is hard for the unsupervised approach to improve ranking performances by leveraging the use of labeled data, when such data is available. We propose employing a supervised learning approach to perform the task, which we refer to as &quot;Supervised Rank Aggregation&quot;. We set up a general framework for conducting rank aggregation with supervised learning, in which learning for rank aggregation is formalized as an optimization issue that minimizes disagreements with the labeled ground truth data. As case study, we focus on Markov Chain based rank aggregation in this paper. The optimization problem is not a convex optimization problem for Markov Chain based methods, however, and thus is hard to solve. We transform the optimization problem into semi-definite programming and give proofs on the correctness. 
Experimental results on meta-searches show that Supervised Rank Aggregation can significantly outperform existing unsupervised methods.</abstract><authors><person><name>Yu-Ting Liu</name><org>Microsoft Research Asia and Beijing Jiatong University</org></person><person><name>Tie-Yan Liu</name><org>Microsoft Research Asia</org></person><person><name>Tao Qin</name><org>Tsinghua University</org></person><person><name>Zhi-Ming Ma</name><org>Chinese Academy of Science</org></person><person><name>Hang Li</name><org>Microsoft Research Asia</org></person></authors></paper><paper><number>287</number><title>A Mobile Application Framework for the Geospatial Web</title><abstract>In this paper we present an application framework that leverages geospatial content on the World Wide Web by enabling innovative modes of interaction and novel types of user interfaces on advanced mobile phones and PDAs. We discuss the current development steps involved in building mobile geospatial Web applications and derive three technological pre-requisites for our framework: spatial query operations based on visibility and field of view, a 2.5D environment model, and a presentation-independent data exchange format for geospatial query results. We propose the Local Visibility Model as a suitable XML-based candidate and present a prototype implementation.</abstract><authors><person><name>Rainer Simon</name><org>Telecommunications Research Center Vienna</org></person><person><name>Peter Froehlich</name><org>Telecommunications Research Center Vienna</org></person></authors></paper><paper><number>308</number><title>GlobeTP: Template-Based Database Replication for Scalable Web Applications</title><abstract>Generic database replication algorithms do not scale linearly in throughput as they require to apply all update, deletion and insertion (UDI) queries to every database replica. The throughput is therefore limited to the point where the number of UDI queries alone is sufficient to overload one server. 
In such scenarios, partial replication of a database can help, as update queries are executed only by a subset of all servers. In this paper we propose GlobeTP, a system that employs partial replication to improve database throughput. GlobeTP exploits the fact that a Web application's query workload is composed of a small set of read and write templates. Using knowledge of these templates and their respective execution costs, GlobeTP provides database table placements that produce significant improvements in database throughput. We demonstrate the efficiency of this technique using two different industry standard benchmarks. In our experiments, GlobeTP increases the throughput by 57% to 150% compared to full replication, while using identical hardware configuration. Furthermore, adding a single query cache improves the throughput by another 30% to 60%.</abstract><authors><person><name>Tobias Groothuyse</name><org>Vrije Universiteit, Amsterdam</org></person><person><name>Swaminathan Sivasubramanian</name><org>Vrije Universiteit, Amsterdam</org></person><person><name>Guillaume Pierre</name><org>Vrije Universiteit, Amsterdam</org></person></authors></paper><paper><number>324</number><title>Dynamic Personalized Pagerank in Entity-Relation Graphs</title><abstract>Extractors and taggers turn unstructured text into entity-relation (ER) graphs where nodes are entities (email, paper, person, conference, company) and edges are relations (wrote, cited, works-for). Typed proximity search of the form type=person NEAR company~&quot;IBM&quot;, paper~&quot;XML&quot; is an increasingly useful search paradigm in ER graphs. Proximity search implementations either perform a Pagerank-like computation at query time, which is slow, or precompute, store and combine per-word Pageranks, which can be very expensive in terms of preprocessing time and space. We present HubRank, a new system for fast, dynamic, space-efficient proximity searches in ER graphs. 
During preprocessing, HubRank computes and indexes certain &quot;sketchy&quot; random walk fingerprints for a small fraction of nodes, carefully chosen using query log statistics. At query time, a small &quot;active&quot; subgraph is identified, bordered by nodes with indexed fingerprints. These fingerprints are adaptively loaded to various resolutions to form approximate personalized Pagerank vectors (PPVs). PPVs at remaining active nodes are now computed iteratively. We report on experiments with CiteSeer's ER graph and millions of real CiteSeer queries. Some representative numbers follow. On our testbed, HubRank preprocesses and indexes 52 times faster than whole-vocabulary PPV computation. A text index is 56 MB. Whole-vocabulary PPVs would consume 102 GB. If PPVs are truncated to 56 MB, precision compared to true Pagerank drops to 0.55; in contrast, HubRank has precision 0.91 at 63 MB. HubRank's average query time is 328 milliseconds; query-time Pagerank computation takes 11 seconds on average.</abstract><authors><person><name>Soumen Chakrabarti</name><org>IIT Bombay</org></person></authors></paper><paper><number>326</number><title>Effort Estimation: How Valuable is it for a Web company to Use a Cross-company Data Set, Compared to Using Its Own Single-company Data Set?</title><abstract>Previous studies comparing the prediction accuracy of effort models built using Web cross- and single-company data sets have been inconclusive, and as such replicated studies are necessary to determine under what circumstances a company can place reliance on a cross-company effort model. This paper therefore replicates a previous study by investigating how successful a cross-company effort model is: i) to  estimate effort for Web projects that belong to a single company and were not used to build the cross-company model; ii) compared to a single-company effort model. 
Our single-company data set had data on 15 Web projects from a single company and our cross-company data set had data on 68 Web projects from 25 different companies. The effort estimates used in our analysis were obtained by means of two effort estimation techniques, namely forward stepwise regression and case-based reasoning. Our results were similar to those from the replicated study, showing that predictions based on the single-company model were significantly more accurate than those based on the cross-company model.</abstract><authors><person><name>Emilia Mendes</name><org>The University of Auckland</org></person><person><name>Sergio Di Martino</name><org>University of Salerno</org></person><person><name>Filomena Ferrucci</name><org>University of Salerno</org></person><person><name>Carmine Gravino</name><org>University of Salerno</org></person></authors></paper><paper><number>339</number><title>Random Web Crawls</title><abstract>This paper proposes a random Web crawl model. A Web crawl is a (biased and partial) image of the Web. This paper deals with the hyperlink structure, i.e. a Web crawl is a graph, whose vertices are the pages and whose edges are the hypertextual links. Of course a Web crawl has a very particular structure; we recall some known results about it. We then propose a model generating similar structures. Our model simply simulates a crawling, i.e. builds and crawls the graph at the same time. The graphs generated have a lot of known properties of Web crawls. Our model is simpler than most random Web graph models, but captures the same properties. 
Notice that it models the crawling process instead of the page writing process of Web graph models.</abstract><authors><person><name>Toufik Bennouas</name><org>Criteo R&amp;amp;D</org></person><person><name>Fabien de Montgolfier</name><org>Universite Paris 7</org></person></authors></paper><paper><number>342</number><title>Scaling Up All Pairs Similarity Search</title><abstract>Given a large collection of sparse vector data in a high dimensional space, we investigate the problem of finding all pairs of vectors whose similarity score (as determined by a function such as cosine distance) is above a given threshold.  We propose novel optimization and indexing techniques for this problem, resulting in an algorithm that is both faster and simpler than the previous state-of-the-art approaches.  We demonstrate the effectiveness of our algorithm on the public DBLP dataset, and on two real-world web applications: generating recommendations for the Orkut social network, and computing pairs of similar queries from search snippet data among the 5 million most frequently issued Google queries. Our algorithm is between 5 times to 20 times faster than previous algorithms on these datasets.</abstract><authors><person><name>Roberto Bayardo</name><org>Google</org></person><person><name>Yiming Ma</name><org>U. California Irvine</org></person><person><name>Ramakrishnan Srikant</name><org>Google</org></person></authors></paper><paper><number>366</number><title>Long Distance Wireless Mesh Network Planning: Problem Formulation and Solution</title><abstract>Several research efforts as well as deployments have chosen IEEE 802.11 as a low-cost, long-distance access technology to bridge the digital divide.  In this paper, we consider the important issue of planning such networks to minimize the system cost.  
This is a non-trivial task since it involves several sets of variables: the network topology, tower heights, antenna types to be used and their orientations, and radio transmit powers.  The task is further complicated due to the presence of network performance constraints, and  the inter-dependence among the variables.  Our first contribution in this paper is the formulation of this problem in terms of the variables, constraints and the optimization criterion.  Our second contribution is in identifying the dependencies among the variables and breaking-down the problem into four tractable sub-parts. In this process, we extensively use domain knowledge to strike a balance between tractability and practicality.&lt;br /&gt;&lt;br /&gt; We have evaluated the proposed algorithms using random input sets as well as real-life instances with success.  We have been able to show detailed planning of network topology, required tower heights, antenna types, and transmit powers for the Ashwini project, a long distance WiFi network under deployment in Andhra Pradesh, India.  In this case, we are able to achieve within 2% additional cost of a lower bound estimate.</abstract><authors><person><name>Sayandeep Sen</name><org>IIT Kanpur</org></person><person><name>Bhaskaran Raman</name><org>Indian Institute of Technology, Kanpur</org></person></authors></paper><paper><number>391</number><title>Yago: A Core of Semantic Knowledge - Unifying WordNet and Wikipedia</title><abstract>We present YAGO, a light-weight and extensible ontology with high coverage and quality. YAGO builds on entities and relations and currently contains roughly 900,000 entities and 5,000,000 facts. This includes the Is-A hierarchy as well as non-taxonomic relations between entities (such as hasWonPrize). The facts have been automatically extracted from the unification of Wikipedia and WordNet, using a carefully designed combination of rule-based and heuristic methods described in this paper. 
The resulting knowledge base is a major step beyond WordNet: in quality by adding knowledge about individuals like persons, organizations, products, etc. with their semantic relationships -- and in quantity by increasing the number of facts by more than an order of magnitude. Our empirical evaluation of fact correctness shows an accuracy of about 95%. YAGO is based on a logically clean model, which is decidable, extensible, and compatible with RDFS. Finally, we show how YAGO can be further extended by state-of-the-art information extraction techniques.</abstract><authors><person><name>Fabian M. Suchanek</name><org>Max-Planck-Institute for Computer Science</org></person><person><name>Gjergji Kasneci</name><org>Max-Planck-Institute for Computer Science</org></person><person><name>Gerhard Weikum</name><org>Max-Planck-Institute for Computer Science</org></person></authors></paper><paper><number>395</number><title>Integrating Value-based Requirement Engineering Models to WebML using VIP Business Modeling Framework</title><abstract>Requirement engineering is emerging as an increasingly important discipline for supporting Web application development, as these are designed to satisfy diverse stakeholder needs, additional functional, information, multimedia and usability requirements as compared to traditional software applications. Moreover, when considering innovative e-commerce applications, value-based requirements engineering is an extremely relevant methodology which exploits the concept of economic value during the requirements engineering activity. In contrast, most of the methodologies proposed for the development of Web applications, primarily focus on the system design, and paying less attention to the requirements engineering, and specifically to value-based requirement engineering. Focusing this aspect, the paper presents integration of value-based requirement engineering models to WebML models using our recently proposed VIP Business Modeling Framework. 
The integration process is demonstrated using a well-known e-commerce application example by first presenting example VIP business models and then deriving WebML process, structural and other models from these business models.</abstract><authors><person><name>Farooque Azam</name><org>BUAA</org></person><person><name>Zhang Li</name><org>BUAA</org></person><person><name>Rashid Ahmad</name><org>BUAA</org></person></authors></paper><paper><number>397</number><title>Optimizing Web Search Using Social Annotations</title><abstract>This paper explores the use of social annotations to improve web search. Nowadays, many services e.g., del.icio.us have been developed for web users to organize and share their favorite web pages on line by using social annotations. We observed that the social annotations can benefit the web search in two aspects:  1) the annotations are usually good summaries of corresponding web pages; 2) the count of annotations indicates the popularity of web pages. Two novel algorithms are proposed to incorporate this information into page ranking: 1) SocialSimRank (SSR) calculates the similarity between social annotations and web queries; 2) SocialPageRank (SPR) captures the popularity of web pages. Preliminary experimental results show that SSR can find the latent semantic association between queries and annotations, while SPR successfully measures the quality (popularity) of a web page from the web users' perspective. We further empirically evaluate the proposed methods with 50 manually annotated queries and 3000 auto-generated queries, on a dataset consisting of 690,482 web pages with 2,879,614 different annotations [32].  Experiments show that both SSR and SPR benefit the web search significantly. 
By incorporating both the SPR and SSR features, the quality of search results can be improved by as much as 14.80% and 25.02% compared with the original performance in MAP on two query sets respectively.</abstract><authors><person><name>Shenghua Bao</name><org>Shanghai Jiao Tong University</org></person><person><name>Xiaoyuan Wu</name><org>Shanghai Jiao Tong University</org></person><person><name>Ben Fei</name><org>IBM China Research Lab</org></person><person><name>Gui-Rong Xue</name><org>Shanghai Jiao Tong University</org></person><person><name>Zhong Su</name><org>IBM China Research Lab</org></person><person><name>Yong Yu</name><org>Shanghai Jiao Tong University</org></person></authors></paper><paper><number>403</number><title>Answering Bounded Continuous Search Queries in the World Wide Web</title><abstract>Search queries applied to extract relevant information from the World Wide Web over a  period of time may be denoted as continuous search  queries. The improvement of continuous search queries may concern not only the quality of retrieved results but also the freshness of results, i.e. the time between the availability  of a respective data object on the Web and the notification of a user by the  search engine. In some cases a user should be notified immediately since the value of the respective information decreases quickly, as e.g. news about companies that affect the value of respective stocks or sales offers for products that may no longer be available after a short period of time. In the document filtering literature the optimization of such queries is usually based on threshold classification. Documents above a quality threshold are returned to a user. The threshold is tuned in order to optimize the quality of retrieved results. The disadvantage of such approaches is that the amount of information returned to a user may hardly be controlled without further user-interaction. 
In this paper we consider the optimization of bounded continuous search queries where only the estimated best k elements are returned to a user. We present a new optimization method for bounded continuous search queries based on the optimal stopping theory and compare the new method to methods currently applied by Web search systems. The new method provides results of significantly higher quality for the cases where very fresh results have to be delivered.</abstract><authors><person><name>Dirk Kukulenz</name><org>Institute of Information Systems</org></person><person><name>Alexandros Ntoulas</name><org>Microsoft Search Labs</org></person></authors></paper><paper><number>420</number><title>Reliable QoS Monitoring Based on Client Feedback</title><abstract>Service-level agreements (SLAs) establish a contract between service providers and clients concerning Quality of Service (QoS) parameters. Without proper penalties, service providers have strong incentives to deviate from the advertised QoS, causing losses to the clients. Reliable QoS monitoring (and proper penalties computed on the basis of delivered QoS) are therefore essential for the trustworthiness of a service-oriented environment. In this paper, we present a novel QoS monitoring mechanism based on quality ratings from the clients. A reputation mechanism collects the ratings and computes the actual quality delivered to the clients. 
The mechanism provides incentives for the clients to report honestly, and pays special attention to minimizing cost and overhead.</abstract><authors><person><name>Radu Jurca</name><org>Ecole Polytechnique Federale de Lausanne EPFL</org></person><person><name>Walter Binder</name><org>University of Lugano</org></person><person><name>Boi Faltings</name><org>Ecole Polytechnique Federale de Lausanne EPFL</org></person></authors></paper><paper><number>428</number><title>Hierarchical, Perceptron-like Learning for Ontology-Based Information Extraction</title><abstract>Recent work on ontology-based Information Extraction (IE) has tried to make an increased use of the knowledge from the target ontology in order to improve the semantic annotation results. However, only very few approaches are able to benefit from the ontology structure and one of them is not a learning system, thus is not easy to adapt to new domains, whereas the other one does not perform semantic annotation of documents, but only ontology population.&lt;br /&gt;&lt;br /&gt; This paper introduces a hierarchical learning approach for IE, which uses the target ontology as an essential part of the extraction process. Hierarchical classification takes into account the relations between concepts, thus benefiting directly from the ontology.&lt;br /&gt;&lt;br /&gt; We also carry out evaluation experiments on the largest available semantically annotated corpus of 146 classes. The results demonstrate clearly the benefits of using knowledge from the ontology for ontology-based IE. 
We also demonstrate the advantages of our approach over other state-of-the-art learning systems on a commonly used benchmark dataset.</abstract><authors><person><name>Yaoyong Li</name><org>University of Sheffield</org></person><person><name>Kalina Bontcheva</name><org>University of Sheffield</org></person></authors></paper><paper><number>429</number><title>An Adaptive Crawler for Locating Hidden-Web Entry Points</title><abstract>In this paper we describe new adaptive crawling strategies to efficiently locate the entry points to hidden-Web sources. The fact that hidden-Web sources are very sparsely distributed makes the problem of locating them especially challenging. We deal with this problem by using the contents of pages to focus the crawl on a topic; by prioritizing promising links within the topic; and by also following links that may not lead to immediate benefit. We propose a new framework whereby crawlers automatically learn patterns of promising links, and adapt their focus as the crawl progresses, thus greatly reducing the amount of required manual setup and tuning. Our experiments over real Web pages in a representative set of domains indicate that online learning leads to significant gains in harvest rates: the adaptive crawlers retrieve up to three times as many forms as crawlers that use a fixed focus strategy.</abstract><authors><person><name>Luciano Barbosa</name><org>University of Utah</org></person><person><name>Juliana Freire</name><org>University of Utah</org></person></authors></paper><paper><number>433</number><title>Just the Right Amount: Extracting Modules from Ontologies</title><abstract>The ability to extract meaningful fragments from an ontology is key for ontology re-use. We propose a definition of a module that guarantees to completely capture the meaning of a given set of terms, i.e., to include all axioms relevant to the meaning of these terms, and study the problem of extracting minimally sized modules. 
We show that the problem of deciding if a module is minimal is undecidable even for rather restricted sub-languages of OWL DL. Hence we propose two "approximations", i.e., alternative definitions of modules for a vocabulary that still provide the above guarantee, but that are possibly too strict, and that may thus result in larger modules: the first approximation is semantic and can be checked using existing DL reasoners; the second is syntactic, and can be computed in polynomial time.  Finally, we report on an empirical evaluation of our syntactic approximation that demonstrates that the modules we extract are surprisingly small.</abstract><authors><person><name>Bernardo Cuenca Grau</name><org>University of Manchester</org></person><person><name>Ian Horrocks</name><org>University of Manchester</org></person><person><name>Yevgeny Kazakov</name><org>University of Manchester</org></person><person><name>Ulrike Sattler</name><org>University of Manchester</org></person></authors></paper><paper><number>435</number><title>From SPARQL to Rules (and back)</title><abstract>As the data and ontology layers of the Semantic Web stack have achieved a certain level of maturity in standard recommendations such as RDF and OWL, the current focus lies on two related aspects. On the one hand, the definition of a suitable query language for RDF, SPARQL, seems to be close to candidate recommendation status within the W3C. The establishment of the Rules layer on top of the existing stack on the other hand marks the next step to be tackled, where especially languages with their roots in Logic Programming and Deductive Databases are receiving considerable attention. The purpose of this paper is threefold. First, we discuss the formal semantics of SPARQL extending recent results in several ways. Second, we provide translations from SPARQL to Datalog with stratified negation as failure. Third, we propose some useful and easy to implement extensions of SPARQL, based on this translation. 
As it turns out, the combination serves for direct implementations of SPARQL on top of existing rules engines as well as a basis for more general rules and query languages on top of RDF.</abstract><authors><person><name>Axel Polleres</name><org>DERI, National University of Ireland</org></person></authors></paper><paper><number>447</number><title>A Fault Model and Mutation Testing of Access Control Policies</title><abstract>To increase confidence in the correctness of specified policies, policy developers can conduct policy testing by supplying typical test inputs (requests) and subsequently checking test outputs (responses) against expected ones.  Unfortunately, manual testing is tedious and few tools exist for automated testing of XACML policies.&lt;br /&gt;&lt;br /&gt; We present a fault model for access control policies and a framework to explore it. The framework includes mutation operators used to implement the fault model, mutant generation, equivalent-mutant detection, and mutant-killing determination. This framework allows us to investigate our fault model, evaluate coverage criteria for test generation and selection, and determine a relationship between structural coverage and fault-detection effectiveness. We have implemented the framework and applied it to various XACML policies. Our experimental results offer valuable insights into choosing mutation operators in mutation testing and choosing coverage criteria in test generation and selection.</abstract><authors><person><name>Evan Martin</name><org>North Carolina State University</org></person><person><name>Tao Xie</name><org>North Carolina State University</org></person></authors></paper><paper><number>461</number><title>Internet-Scale Collection of Human-Reviewed Data</title><abstract>Enterprise data processing and content aggregation systems often require extensive use of human reviewed data (e.g. for training and monitoring machine learning-based applications). 
Today these needs are often met by in-house efforts or offshore contracting. Emerging applications attempt to provide automation for human reviewed data collection at Internet-scale. We conduct extensive experiments to study the effectiveness of one such application. We also study the feasibility of using Yahoo! Answers, a general question-answering forum, for human review data collection.</abstract><authors><person><name>Qi Su</name><org>Yahoo! Inc</org></person><person><name>Dmitry Pavlov</name><org>Yahoo! Inc</org></person><person><name>Jyh-Herng Chow</name><org>Yahoo! Inc</org></person><person><name>Wendell Baker</name><org>Yahoo! Inc</org></person></authors></paper><paper><number>464</number><title>Using Google Distance to Weight Approximate Ontology Matches</title><abstract>Discovering mappings between concept hierarchies is widely regarded as one of the hardest and most urgent problems facing the Semantic Web. The problem is even harder in domains where concepts are inherently vague and ill-defined, and cannot be given a crisp definition. A notion of approximate concept mapping is required in such domains, but until now, no such notion is available.&lt;br /&gt;&lt;br /&gt; The first contribution of this paper is a definition for approximate mappings between concepts. Roughly, a mapping between two concepts is decomposed into a number of submappings, and a sloppiness value determines the fraction of these submappings that can be ignored when establishing the mapping.&lt;br /&gt;&lt;br /&gt; A potential problem of such a definition is that with an increasing sloppiness value, it will gradually allow mappings between any two arbitrary concepts. To improve on this trivial behaviour, we need to design a heuristic weighting which minimises the sloppiness required to conclude desirable matches, but at the same time maximises the sloppiness required to conclude undesirable matches. 
The second contribution of this paper is to show that a Google-based similarity measure has exactly these desirable properties.&lt;br /&gt;&lt;br /&gt; We establish these results by experimental validation in the domain of musical genres. We show that this domain does suffer from ill-defined concepts. We take two real-life genre hierarchies from the Web, we compute approximate mappings between them at varying levels of sloppiness, and we validate our results against a hand-crafted Gold Standard.&lt;br /&gt;&lt;br /&gt; Our method makes use of the huge amount of knowledge that is implicit in the current Web, and exploits this knowledge as a heuristic for establishing approximate mappings between ill-defined concepts.</abstract><authors><person><name>Risto Gligorov</name><org>Philips Research</org></person><person><name>Zharko Aleksovski</name><org>Philips Research</org></person><person><name>Warner ten Kate</name><org>Philips Research</org></person><person><name>Frank van Harmelen</name><org>Vrije Universiteit Amsterdam</org></person></authors></paper><paper><number>468</number><title>A Framework for Rapid Integration of Presentation Components</title><abstract>The development of user interfaces (UIs) is one of the most time-consuming aspects in software development. In this context, the lack of proper reuse mechanisms for UIs is increasingly becoming manifest, especially as software development is more and more moving toward composite applications. In this paper we propose a framework for the integration of stand-alone modules or applications, where integration occurs at the presentation layer. Hence, the final goal is to reduce the effort required for UI development by maximizing reuse.&lt;br /&gt;&lt;br /&gt; The design of the framework is inspired by lessons learned from application integration, appropriately modified to account for the specificity of the UI integration problem. 
We provide an abstract component model to specify characteristics and behaviors of presentation components and propose an event-based composition model to specify the composition logic. Components and composition are described by means of a simple XML-based language, which is interpreted by a runtime middleware for the execution of the resulting composite application. A proof-of-concept prototype allows us to show that the proposed component model can also easily be applied to existing presentation components, built with different languages and/or component technologies.</abstract><authors><person><name>Jin Yu</name><org>University of New South Wales</org></person><person><name>Boualem Benatallah</name><org>University of New South Wales</org></person><person><name>Regis Saint-Paul</name><org>University of New South Wales</org></person><person><name>Fabio Casati</name><org>University of Trento</org></person><person><name>Florian Daniel</name><org>Politecnico di Milano</org></person><person><name>Maristella Matera</name><org>Politecnico di Milano</org></person></authors></paper><paper><number>469</number><title>Preference-based Selection of Highly Configurable Web Services</title><abstract>A key challenge for dynamic Web service selection is that Web services are typically highly configurable and service requesters often have dynamic preferences on service configurations. Current approaches, such as WS-Agreement, describe Web services by enumerating the various possible service configurations, an inefficient approach when dealing with numerous service attributes with large value spaces. We model Web service configurations and associated prices and preferences more compactly using utility function policies, which also allows us to draw from multi-attribute decision theory methods to develop an algorithm for optimal service selection. 
In this paper, we present an OWL ontology for the specification of configurable Web service offers and requests, and a flexible and extensible framework for optimal service selection that combines declarative logic-based matching rules with optimization methods, such as linear programming. Assuming additive price/preference functions, experimental results indicate that our algorithm introduces an overhead of only around 2 sec. compared to a random service selection, while giving optimal results. The overhead, as percentage of total time, decreases as the number of offers and configurations increase.</abstract><authors><person><name>Steffen Lamparter</name><org>Institute AIFB, Universitaet Karlsruhe TH</org></person><person><name>Anupriya Ankolekar</name><org>Institute AIFB, Universitaet Karlsruhe TH</org></person><person><name>Rudi Studer</name><org>Institute AIFB, Universitaet Karlsruhe TH</org></person><person><name>Stephan Grimm</name><org>FZI Karlsruhe</org></person></authors></paper><paper><number>479</number><title>Introduction and Evaluation of Martlet, a Scientific Workflow Language for Abstracted Parallelisation</title><abstract>The workflow language Martlet described in this paper implements a new programming model that allows users to write parallel programs and analyse distributed data without having to be aware of the details of the parallelisation. Martlet abstracts the parallelisation of the computation and the splitting of the data through the inclusion of constructs inspired by functional programming. These allow programs to be written as an abstract description that can be adjusted automatically at runtime to match the data set and available resources. 
Using this model it is possible to write programs to perform complex calculations across a distributed data set such as Singular Value Decomposition or Least Squares problems, as well as creating an intuitive way of working with distributed systems.&lt;br /&gt;&lt;br /&gt; Having described and evaluated Martlet against other functional languages for parallel computation, this paper goes on to look at how Martlet might develop. In doing so it covers both possible additions to the language itself, and the use of JIT compilers to increase the range of platforms it is capable of running on.</abstract><authors><person><name>Daniel James Goodman</name><org>Oxford University</org></person></authors></paper><paper><number>481</number><title>P-TAG: Large Scale Automatic Generation of Personalized Annotation TAGs for the Web</title><abstract>The success of the Semantic Web depends on the availability of Web pages annotated with metadata. Free form metadata or tags, as used in social bookmarking and folksonomies based systems, have become more and more popular and successful. Such tags are relevant keywords associated with or assigned to a piece of information (e.g., a Web page), thus describing the item and enabling keyword-based classification. In this paper we propose P-TAG, a method which automatically generates personalized tags for Web pages. Keywords are generated based on the content of the Web page but also based on the content of the user's Desktop, thus expressing a personalized viewpoint very relevant for personal tags. We implemented and tested several algorithms for this approach and evaluated the relevance of the resulting keywords. 
These evaluations showed very promising results and we are therefore very confident that such a user oriented automatic tagging approach can provide large scale personalized metadata annotation as an important step towards realizing the Semantic Web.</abstract><authors><person><name>Paul-Alexandru Chirita</name><org>L3S / University of Hannover</org></person><person><name>Stefania Costache</name><org>L3S Research Center / University of Hannover</org></person><person><name>Siegfried Handschuh</name><org>National University of Ireland</org></person><person><name>Wolfgang Nejdl</name><org>University of Hannover</org></person></authors></paper><paper><number>485</number><title>A Unified Platform for Data Driven Web Applications with Automatic Client-Server Partitioning</title><abstract>Data-driven web applications are structured into three tiers with different programming models at each tier. This division forces developers to manually partition application functionality across the tiers, resulting in complex logic, suboptimal partitioning, and expensive re-partitioning of applications.&lt;br /&gt;&lt;br /&gt; In this paper, we introduce a unified platform for automatic partitioning of data-driven web applications. Our approach is based on Hilda, a high-level declarative programming language with a unified data and programming model for all the layers of the application. Based on run-time properties of the application, Hilda's run time system automatically partitions the application between the tiers to improve response time while adhering to memory or processing constraints at the clients. 
We evaluate our methodology with traces from a real application and with TPC-W, and our results show that automatic partitioning outperforms manual partitioning without the associated development overhead.</abstract><authors><person><name>Fan Yang</name><org>Cornell University</org></person><person><name>Nitin Gupta</name><org>Cornell University</org></person><person><name>Nicholas Gerner</name><org>Cornell University</org></person><person><name>Xin Qi</name><org>Cornell University</org></person><person><name>Alan Demers</name><org>Cornell University</org></person><person><name>Johannes Gehrke</name><org>Cornell University</org></person><person><name>Jayavel Shanmugasundaram</name><org>Yahoo!</org></person></authors></paper><paper><number>495</number><title>A Large-scale Evaluation and Analysis of Personalized Search Strategies</title><abstract>Although personalized search has been proposed for many years and many personalization strategies have been investigated, it is still unclear whether personalization is consistently effective on different queries for different users, and under different search contexts. In this paper, we study the problem and get some preliminary conclusions. We present a large-scale personalized search evaluation framework based on search logs and then evaluate five personalized search strategies (including two click-based and three profile-based ones) using 12-day MSN search logs. By analyzing the results, we reveal that personalized search has significant improvement over common web search on some queries but it also has little effect on other queries (e.g., queries with small click entropy) and even harms search accuracy under some situations. Furthermore, we show that click-based personalization strategies perform consistently and considerably well while profile-based ones are unstable in our experiments. 
We also reveal that both long-term and short-term contexts are very important in improving search performance for profile-based personalized search strategies.</abstract><authors><person><name>Zhicheng Dou</name><org>Nankai University, China</org></person><person><name>Ruihua Song</name><org>Microsoft Research Asia</org></person><person><name>Ji-Rong Wen</name><org>Microsoft Research Asia</org></person></authors></paper><paper><number>504</number><title>Mapping-Driven XML Transformation</title><abstract>Clio is an existing schema-mapping tool that provides user-friendly means to manage and facilitate the complex task of transformation and integration of heterogeneous data such as XML over the Web or in XML databases. By means of mappings from source to target schemas, Clio can help users conveniently establish the precise semantics of data transformation and integration. In this paper we study the problem of how to efficiently implement such data transformation (i.e., generating target data from the source data based on schema mappings). We present a three-phase framework for high-performance XML-to-XML transformation based on schema mappings, and discuss methodologies and algorithms for implementing these phases. In particular, we elaborate on novel techniques such as streamed extraction of mapped source values and scalable disk-based merging of overlapping data (including duplicate elimination). We compare our transformation framework with alternative methods such as using XQuery or SQL/XML provided by current commercial databases. 
The results demonstrate that the three-phase framework (although as simple as it is) is highly scalable and outperforms the alternative methods by orders of magnitude.</abstract><authors><person><name>Haifeng Jiang</name><org>IBM Almaden Research Center</org></person><person><name>Howard Ho</name><org>IBM Almaden Research Center</org></person><person><name>Lucian Popa</name><org>IBM Almaden Research Center</org></person><person><name>Wook-Shin Han</name><org>Computer Engineering Dept., Kyungpook National University, Korea</org></person></authors></paper><paper><number>507</number><title>A High-Performance Interpretive Approach to Schema-Directed Parsing</title><abstract>XML delivers key advantages in interoperability due to its flexibility, expressiveness, and platform-neutrality.  As XML has become a performance-critical aspect of the next generation of business computing infrastructure, however, it has become increasingly clear that XML parsing often carries a heavy performance penalty, and that current, widely-used parsing technologies are unable to meet the performance demands of an XML-based computing infrastructure.  Several efforts have been made to address this performance gap through the use of grammar-based parser generation.  While the performance of generated parsers has been significantly improved, adoption of the technology has been hindered by the complexity of compiling and deploying the generated parsers.  Through careful analysis of the operations required for parsing and validation, we have devised a set of specialized bytecodes, designed for the task of XML parsing and validation.  These bytecodes are designed to engender the benefits of fine-grained composition of parsing and validation that make existing compiled parsers fast, while being coarse-grained enough to minimize interpreter overhead.  
This technique of using an interpretive, validating parser balances the need for performance against the requirements of simple tooling and robust scalable infrastructure. Our approach is demonstrated with a specialized schema compiler, used to generate bytecodes which in turn drive an interpretive parser. With almost as little tooling and deployment complexity as a traditional interpretive parser, the bytecode-driven parser usually demonstrates performance within 20% of the fastest fully compiled solutions.</abstract><authors><person><name>Morris Matsa</name><org>IBM</org></person><person><name>Eric Perkins</name><org>IBM</org></person><person><name>Abraham Heifets</name><org>IBM</org></person><person><name>Margaret Gaitatzes Kostoulas</name><org>IBM</org></person><person><name>Daniel Silva</name><org>IBM</org></person><person><name>Noah Mendelsohn</name><org>IBM</org></person><person><name>Michelle Leger</name><org>IBM</org></person></authors></paper><paper><number>511</number><title>Identifying and Discriminating Between Web and Peer-to-Peer Traffic in the Network Core</title><abstract>Traffic classification is the ability to identify and categorize network traffic by application type. In this paper, we consider the problem of traffic classification in the network core. Classification at the core is challenging because only partial information of the flows and their contributors is available. We address this problem by developing and evaluating a classification framework that can classify a flow using only unidirectional flow information. We validated this approach using recent full-payload packet traces that we collected and pre-classified to establish a "base truth". From our evaluation, we find that flow statistics along the server-to-client path of a TCP connection provide higher classification accuracy than flow statistics along the client-to-server path. 
Because collection of the server-to-client flow statistics may not always be feasible, we developed and verified an algorithm that can estimate the missing statistics from a unidirectional packet trace.</abstract><authors><person><name>Jeffrey Erman</name><org>University of Calgary</org></person><person><name>Anirban Mahanti</name><org>University of Calgary</org></person><person><name>Martin Arlitt</name><org>HP Labs/University of Calgary</org></person><person><name>Carey Williamson</name><org>University of Calgary</org></person></authors></paper><paper><number>516</number><title>Expertise Networks in Online Communities: Structure and Algorithms</title><abstract>Web-based communities have become an important place for people to seek and share expertise. We find that networks in these communities typically differ in their topology from other online networks such as the World Wide Web. Systems targeted to augment web-based communities by automatically identifying users with expertise, for example, need to adapt to the underlying interaction dynamics. In this study, we analyze the Java Forum, a large online help-seeking community, using social network analysis methods.  We test a set of network-based ranking algorithms, including PageRank and HITS, on this large size social network in order to identify users with high expertise. We then use simulations to identify a small number of simple rules governing the question-answer dynamic in the network. These simple rules not only replicate the structural characteristics and algorithm performance on the empirically observed Java Forum, but also allow us to evaluate how other algorithms may perform in communities with different characteristics. 
We believe this approach will be fruitful for practical algorithm design and implementation for online expertise-sharing communities.</abstract><authors><person><name>Jun Zhang</name><org>University of Michigan</org></person><person><name>Mark Ackerman</name><org>University of Michigan</org></person><person><name>Lada Adamic</name><org>University of Michigan</org></person></authors></paper><paper><number>520</number><title>Why We Search: Visualizing and Predicting User Behavior</title><abstract>The aggregation and comparison of behavioral patterns on the WWW represent a tremendous opportunity for understanding past behaviors and predicting future behaviors.  In this paper, we take a first step at achieving this goal.  We present a large scale study correlating the behaviors of Internet users on multiple systems ranging in size from 27 m