% Article entry for Jafarian and Park (2022): self-supervised learning of 3D
% geometry of dressed humans from social media dance videos.
% `abstractNote` is not a standard BibTeX field; it is kept under its original
% name for whatever tooling produced it, and standard styles ignore it.
% NOTE(review): the author field carries surnames only (presumably Yasamin
% Jafarian and Hyun Soo Park -- verify), and the entry has no journal field,
% which the article entry type requires. TODO: fill in both from the
% publisher record, preferably together with a DOI.
@article{jafarian_park_2022,
  title        = {Self-supervised {3D} Representation Learning of Dressed Humans from Social Media Videos},
  abstractNote = {A key challenge of learning a visual representation for the 3D high fidelity
geometry of dressed humans lies in the limited availability of the ground truth
data (e.g., 3D scanned models), which results in the performance degradation of
3D human reconstruction when applying to real-world imagery. We address this
challenge by leveraging a new data resource: a number of social media dance
videos that span diverse appearance, clothing styles, performances, and
identities. Each video depicts dynamic movements of the body and clothes of a
single person while lacking the 3D ground truth geometry. To learn a visual
representation from these videos, we present a new self-supervised learning
method to use the local transformation that warps the predicted local geometry
of the person from an image to that of another image at a different time
instant. This allows self-supervision by enforcing a temporal coherence over
the predictions. In addition, we jointly learn the depths along with the
surface normals that are highly responsive to local texture, wrinkle, and shade
by maximizing their geometric consistency. Our method is end-to-end trainable,
resulting in high fidelity depth estimation that predicts fine geometry
faithful to the input real image. We further provide a theoretical bound of
self-supervised learning via an uncertainty analysis that characterizes the
performance of the self-supervised learning without training. We demonstrate
that our method outperforms the state-of-the-art human depth estimation and
human shape recovery approaches on both real and rendered images.},
  author       = {Jafarian and Park},
  year         = {2022},
  month        = dec,
}