Hello, I am trying to parallelize my code. While the rest was easy, I have spent more than a month on the BGL part (even though it accounts for only about 2% of the runtime) without any success. I have written the following code:

using namespace boost;
using boost::graph::distributed::mpi_process_group;

typedef adjacency_list<vecS, distributedS<mpi_process_group, vecS>, undirectedS> Graph;
typedef iterator_property_map<std::vector<int>::iterator,
                              property_map<Graph, vertex_index_t>::type> LocalMap;

// nV, arcCnt and num are defined outside this snippet
int getConnectedComponents(std::vector< std::pair<int,int> > &arcs,
                           const std::vector<int> &isActive)
{
    std::vector<int> localComponent(nV+1);
    std::vector<int> globalComponent(nV+1);

    Graph G(nV+1);                                        // VERY SLOW!!

    // Add edges to the graph                                VERY SLOW!!
    for( std::vector< std::pair<int,int> >::iterator it = arcs.begin(); it != arcs.end(); ++it )
        if( isActive[arcCnt++] == 0 )
            add_edge(vertex(it->first, G), vertex(it->second, G), G);

    synchronize(G);

    // Compute the number of connected components            VERY SLOW with -np 2
    LocalMap components(localComponent.begin(), get(vertex_index, G));
    num = connected_components_ps(G, components);

    // Collect the global component vector
    components.set_max_ghost_cells(0);
    for( int i = 0; i < nV+1; i++ )
        globalComponent[i] = get(components, vertex(i, G));

    synchronize(components);

    return num;
}

That code parallelizes the following serial implementation:

typedef adjacency_list<vecS, vecS, undirectedS> SerialGraph;

int getConnectedComponents(std::vector< std::pair<int,int> > &arcs,
                           const std::vector<int> &isActive)
{
    int rank;
    std::vector<int> globalComponent(nV+1);

    SerialGraph G(nV+1);

    MPI_Comm_rank(PETSC_COMM_WORLD, &rank);

    // Only rank 0 has the arc data structure
    if( rank == 0 )
    {
        // Add edges to the graph
        for( std::vector< std::pair<int,int> >::iterator it = arcs.begin(); it != arcs.end(); ++it )
            if( isActive[arcCnt++] == 0 )
                boost::add_edge(it->first, it->second, G);

        num = connected_components(G, &globalComponent[0]);
    }

    // Propagate the number of components (needed for allocation)
    MPI_Bcast( &num, 1, MPI_INT, 0, PETSC_COMM_WORLD );

    // Propagate the connected-component vector
    MPI_Bcast( &globalComponent[0], nV+1, MPI_INT, 0, PETSC_COMM_WORLD );

    return num;
}

However, the parallel version is about 5 times slower than the non-parallel one when run on a single core, and 15-20 times slower on two cores. What am I doing wrong?

Regards,
Alessio
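
P.S. In case it helps, this is roughly the surrounding context the two snippets assume: nV, arcCnt and num live outside the function, and the real code takes its communicator from PETSc (PETSC_COMM_WORLD). The driver below is only a sketch with placeholder names and values, meant to be compiled together with one of the two getConnectedComponents definitions above.

// Placeholder driver sketch; the real values of nV, arcs and isActive come from elsewhere.
#include <mpi.h>
#include <utility>
#include <vector>

int nV     = 0;   // highest vertex index; the graph is built with nV+1 vertices
int arcCnt = 0;   // running index into isActive
int num    = 0;   // number of connected components found

int getConnectedComponents(std::vector< std::pair<int,int> > &arcs,
                           const std::vector<int> &isActive);

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    // Tiny placeholder problem: vertices 0..3, two active arcs -> 2 components.
    nV = 3;
    std::vector< std::pair<int,int> > arcs;
    arcs.push_back(std::make_pair(0, 1));
    arcs.push_back(std::make_pair(2, 3));
    std::vector<int> isActive(arcs.size(), 0);   // 0 == arc is active, as in the loops above

    arcCnt = 0;
    getConnectedComponents(arcs, isActive);

    MPI_Finalize();
    return 0;
}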